linux-brain/drivers/infiniband/hw/hfi1/init.c

2114 lines
55 KiB
C
Raw Normal View History

/*
IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support Currently the driver doesn't support completion vectors. These are used to indicate which sets of CQs should be grouped together into the same vector. A vector is a CQ processing thread that runs on a specific CPU. If an application has several CQs bound to different completion vectors, and each completion vector runs on different CPUs, then the completion queue workload is balanced. This helps scale as more nodes are used. Implement CQ completion vector support using a global workqueue where a CQ entry is queued to the CPU corresponding to the CQ's completion vector. Since the workqueue is global, it's guaranteed to always be there when queueing CQ entries; Therefore, the RCU locking for cq->rdi->worker in the hot path is superfluous. Each completion vector is assigned to a different CPU. The number of completion vectors available is computed by taking the number of online, physical CPUs from the local NUMA node and subtracting the CPUs used for kernel receive queues and the general interrupt. Special use cases: * If there are no CPUs left for completion vectors, the same CPU for the general interrupt is used; Therefore, there would only be one completion vector available. * For multi-HFI systems, the number of completion vectors available for each device is the total number of completion vectors in the local NUMA node divided by the number of devices in the same NUMA node. If there's a division remainder, the first device to get initialized gets an extra completion vector. Upon a CQ creation, an invalid completion vector could be specified. Handle it as follows: * If the completion vector is less than 0, set it to 0. * Set the completion vector to the result of the passed completion vector moded with the number of device completion vectors available. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 22:43:55 +09:00
* Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/delay.h>
#include <linux/xarray.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/hrtimer.h>
#include <linux/bitmap.h>
mm: replace all open encodings for NUMA_NO_NODE Patch series "Replace all open encodings for NUMA_NO_NODE", v3. All these places for replacement were found by running the following grep patterns on the entire kernel code. Please let me know if this might have missed some instances. This might also have replaced some false positives. I will appreciate suggestions, inputs and review. 1. git grep "nid == -1" 2. git grep "node == -1" 3. git grep "nid = -1" 4. git grep "node = -1" This patch (of 2): At present there are multiple places where invalid node number is encoded as -1. Even though implicitly understood it is always better to have macros in there. Replace these open encodings for an invalid node number with the global macro NUMA_NO_NODE. This helps remove NUMA related assumptions like 'invalid node' from various places redirecting them to a common definition. Link: http://lkml.kernel.org/r/1545127933-10711-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com> Reviewed-by: David Hildenbrand <david@redhat.com> Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> [ixgbe] Acked-by: Jens Axboe <axboe@kernel.dk> [mtip32xx] Acked-by: Vinod Koul <vkoul@kernel.org> [dmaengine.c] Acked-by: Michael Ellerman <mpe@ellerman.id.au> [powerpc] Acked-by: Doug Ledford <dledford@redhat.com> [drivers/infiniband] Cc: Joseph Qi <jiangqi903@gmail.com> Cc: Hans Verkuil <hverkuil@xs4all.nl> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-06 08:42:58 +09:00
#include <linux/numa.h>
#include <rdma/rdma_vt.h>
#include "hfi.h"
#include "device.h"
#include "common.h"
#include "trace.h"
#include "mad.h"
#include "sdma.h"
#include "debugfs.h"
#include "verbs.h"
staging/rdma/hfi1: Add support for enabling/disabling PCIe ASPM hfi1 HW has a high PCIe ASPM L1 exit latency and also advertises an acceptable latency less than actual ASPM latencies. Additional mechanisms than those provided by BIOS/OS are therefore required to enable/disable ASPM for hfi1 to provide acceptable power/performance trade offs. This patch adds this support. By means of a module parameter ASPM can be either (a) always enabled (power save mode) (b) always disabled (performance mode) (c) enabled/disabled dynamically. The dynamic mode implements two heuristics to alleviate possible problems with high ASPM L1 exit latency. ASPM is normally enabled but is disabled if (a) there are any active user space PSM contexts, or (b) for verbs, ASPM is disabled as interrupt activity for a context starts to increase. A few more points about the verbs implementation. In order to reduce lock/cache contention between multiple verbs contexts, some processing is done at the context layer before contending for device layer locks. ASPM is disabled when two interrupts for a context happen within 1 millisec. A timer is scheduled which will re-enable ASPM after 1 second should the interrupt activity cease. Normally, every interrupt, or interrupt-pair should push the timer out further. However, since this might increase the processing load per interrupt, pushing the timer out is postponed for half a second. If after half a second we get two interrupts within 1 millisec the timer is pushed out by another second. Finally, the kernel ASPM API is not used in this patch. This is because this patch does several non-standard things as SW workarounds for HW issues. As mentioned above, it enables ASPM even when advertised actual latencies are greater than acceptable latencies. Also, whereas the kernel API only allows drivers to disable ASPM from driver probe, this patch enables/disables ASPM directly from interrupt context. Due to these reasons the kernel ASPM API was not used. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Dean Luick <dean.luick@intel.com> Reviewed-by: Ira Weiny <ira.weiny@intel.com> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-02-04 07:33:06 +09:00
#include "aspm.h"
#include "affinity.h"
#include "vnic.h"
#include "exp_rcv.h"
#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt
/*
* min buffers we want to have per context, after driver
*/
#define HFI1_MIN_USER_CTXT_BUFCNT 7
#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
#define NUM_IB_PORTS 1
/*
* Number of user receive contexts we are configured to use (to allow for more
* pio buffers per ctxt, etc.) Zero means use one user context per CPU.
*/
int num_user_contexts = -1;
module_param_named(num_user_contexts, num_user_contexts, int, 0444);
MODULE_PARM_DESC(
num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)");
uint krcvqs[RXE_NUM_DATA_VL];
int krcvqsset;
module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
/* computed based on above array */
unsigned long n_krcvqs;
static unsigned hfi1_rcvarr_split = 25;
module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
static uint eager_buffer_size = (8 << 20); /* 8MB */
module_param(eager_buffer_size, uint, S_IRUGO);
MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB");
static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
static uint hfi1_hdrq_entsize = 32;
module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444);
MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");
unsigned int user_credit_return_threshold = 33; /* default is 33% */
module_param(user_credit_return_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
static inline u64 encode_rcv_header_entry_size(u16 size);
DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
static int hfi1_create_kctxt(struct hfi1_devdata *dd,
struct hfi1_pportdata *ppd)
{
struct hfi1_ctxtdata *rcd;
int ret;
/* Control context has to be always 0 */
BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);
ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd);
if (ret < 0) {
dd_dev_err(dd, "Kernel receive context allocation failed\n");
return ret;
}
/*
* Set up the kernel context flags here and now because they use
* default values for all receive side memories. User contexts will
* be handled as they are created.
*/
rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
HFI1_CAP_KGET(NODROP_RHQ_FULL) |
HFI1_CAP_KGET(NODROP_EGR_FULL) |
HFI1_CAP_KGET(DMA_RTAIL);
/* Control context must use DMA_RTAIL */
if (rcd->ctxt == HFI1_CTRL_CTXT)
rcd->flags |= HFI1_CAP_DMA_RTAIL;
rcd->seq_cnt = 1;
rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
if (!rcd->sc) {
dd_dev_err(dd, "Kernel send context allocation failed\n");
return -ENOMEM;
}
hfi1_init_ctxt(rcd->sc);
return 0;
}
/*
* Create the receive context array and one or more kernel contexts
*/
int hfi1_create_kctxts(struct hfi1_devdata *dd)
{
u16 i;
int ret;
dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd),
GFP_KERNEL, dd->node);
if (!dd->rcd)
return -ENOMEM;
for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
ret = hfi1_create_kctxt(dd, dd->pport);
if (ret)
goto bail;
}
return 0;
bail:
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i)
hfi1_free_ctxt(dd->rcd[i]);
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
/* All the contexts should be freed, free the array */
kfree(dd->rcd);
dd->rcd = NULL;
return ret;
}
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
/*
* Helper routines for the receive context reference count (rcd and uctxt).
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
*/
static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd)
{
kref_init(&rcd->kref);
}
/**
* hfi1_rcd_free - When reference is zero clean up.
* @kref: pointer to an initialized rcd data structure
*
*/
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
static void hfi1_rcd_free(struct kref *kref)
{
unsigned long flags;
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
struct hfi1_ctxtdata *rcd =
container_of(kref, struct hfi1_ctxtdata, kref);
spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
rcd->dd->rcd[rcd->ctxt] = NULL;
spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);
IB/hfi1: Close race condition on user context disable and close When disabling and removing a receive context, it is possible for an asynchronous event (i.e IRQ) to occur. Because of this, there is a race between cleaning up the context, and the context being used by the asynchronous event. cpu 0 (context cleanup) rc->ref_count-- (ref_count == 0) hfi1_rcd_free() cpu 1 (IRQ (with rcd index)) rcd_get_by_index() lock ref_count+++ <-- reference count race (WARNING) return rcd unlock cpu 0 hfi1_free_ctxtdata() <-- incorrect free location lock remove rcd from array unlock free rcd This race will cause the following WARNING trace: WARNING: CPU: 0 PID: 175027 at include/linux/kref.h:52 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] CPU: 0 PID: 175027 Comm: IMB-MPI1 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 Call Trace: dump_stack+0x19/0x1b __warn+0xd8/0x100 warn_slowpath_null+0x1d/0x20 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] is_rcv_urgent_int+0x24/0x90 [hfi1] general_interrupt+0x1b6/0x210 [hfi1] __handle_irq_event_percpu+0x44/0x1c0 handle_irq_event_percpu+0x32/0x80 handle_irq_event+0x3c/0x60 handle_edge_irq+0x7f/0x150 handle_irq+0xe4/0x1a0 do_IRQ+0x4d/0xf0 common_interrupt+0x162/0x162 The race can also lead to a use after free which could be similar to: general protection fault: 0000 1 SMP CPU: 71 PID: 177147 Comm: IMB-MPI1 Kdump: loaded Tainted: G W OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 task: ffff9962a8098000 ti: ffff99717a508000 task.ti: ffff99717a508000 __kmalloc+0x94/0x230 Call Trace: ? hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_aio_write+0xba/0x110 [hfi1] do_sync_readv_writev+0x7b/0xd0 do_readv_writev+0xce/0x260 ? handle_mm_fault+0x39d/0x9b0 ? pick_next_task_fair+0x5f/0x1b0 ? sched_clock_cpu+0x85/0xc0 ? __schedule+0x13a/0x890 vfs_writev+0x35/0x60 SyS_writev+0x7f/0x110 system_call_fastpath+0x22/0x27 Use the appropriate kref API to verify access. Reorder context cleanup to ensure context removal before cleanup occurs correctly. Cc: stable@vger.kernel.org # v4.14.0+ Fixes: f683c80ca68e ("IB/hfi1: Resolve kernel panics by reference counting receive contexts") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-02-27 01:45:35 +09:00
hfi1_free_ctxtdata(rcd->dd, rcd);
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
kfree(rcd);
}
/**
* hfi1_rcd_put - decrement reference for rcd
* @rcd: pointer to an initialized rcd data structure
*
* Use this to put a reference after the init.
*/
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
{
if (rcd)
return kref_put(&rcd->kref, hfi1_rcd_free);
return 0;
}
/**
* hfi1_rcd_get - increment reference for rcd
* @rcd: pointer to an initialized rcd data structure
*
* Use this to get a reference after the init.
IB/hfi1: Close race condition on user context disable and close When disabling and removing a receive context, it is possible for an asynchronous event (i.e IRQ) to occur. Because of this, there is a race between cleaning up the context, and the context being used by the asynchronous event. cpu 0 (context cleanup) rc->ref_count-- (ref_count == 0) hfi1_rcd_free() cpu 1 (IRQ (with rcd index)) rcd_get_by_index() lock ref_count+++ <-- reference count race (WARNING) return rcd unlock cpu 0 hfi1_free_ctxtdata() <-- incorrect free location lock remove rcd from array unlock free rcd This race will cause the following WARNING trace: WARNING: CPU: 0 PID: 175027 at include/linux/kref.h:52 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] CPU: 0 PID: 175027 Comm: IMB-MPI1 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 Call Trace: dump_stack+0x19/0x1b __warn+0xd8/0x100 warn_slowpath_null+0x1d/0x20 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] is_rcv_urgent_int+0x24/0x90 [hfi1] general_interrupt+0x1b6/0x210 [hfi1] __handle_irq_event_percpu+0x44/0x1c0 handle_irq_event_percpu+0x32/0x80 handle_irq_event+0x3c/0x60 handle_edge_irq+0x7f/0x150 handle_irq+0xe4/0x1a0 do_IRQ+0x4d/0xf0 common_interrupt+0x162/0x162 The race can also lead to a use after free which could be similar to: general protection fault: 0000 1 SMP CPU: 71 PID: 177147 Comm: IMB-MPI1 Kdump: loaded Tainted: G W OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 task: ffff9962a8098000 ti: ffff99717a508000 task.ti: ffff99717a508000 __kmalloc+0x94/0x230 Call Trace: ? hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_aio_write+0xba/0x110 [hfi1] do_sync_readv_writev+0x7b/0xd0 do_readv_writev+0xce/0x260 ? handle_mm_fault+0x39d/0x9b0 ? pick_next_task_fair+0x5f/0x1b0 ? sched_clock_cpu+0x85/0xc0 ? __schedule+0x13a/0x890 vfs_writev+0x35/0x60 SyS_writev+0x7f/0x110 system_call_fastpath+0x22/0x27 Use the appropriate kref API to verify access. Reorder context cleanup to ensure context removal before cleanup occurs correctly. Cc: stable@vger.kernel.org # v4.14.0+ Fixes: f683c80ca68e ("IB/hfi1: Resolve kernel panics by reference counting receive contexts") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-02-27 01:45:35 +09:00
*
* Return : reflect kref_get_unless_zero(), which returns non-zero on
* increment, otherwise 0.
*/
IB/hfi1: Close race condition on user context disable and close When disabling and removing a receive context, it is possible for an asynchronous event (i.e IRQ) to occur. Because of this, there is a race between cleaning up the context, and the context being used by the asynchronous event. cpu 0 (context cleanup) rc->ref_count-- (ref_count == 0) hfi1_rcd_free() cpu 1 (IRQ (with rcd index)) rcd_get_by_index() lock ref_count+++ <-- reference count race (WARNING) return rcd unlock cpu 0 hfi1_free_ctxtdata() <-- incorrect free location lock remove rcd from array unlock free rcd This race will cause the following WARNING trace: WARNING: CPU: 0 PID: 175027 at include/linux/kref.h:52 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] CPU: 0 PID: 175027 Comm: IMB-MPI1 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 Call Trace: dump_stack+0x19/0x1b __warn+0xd8/0x100 warn_slowpath_null+0x1d/0x20 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] is_rcv_urgent_int+0x24/0x90 [hfi1] general_interrupt+0x1b6/0x210 [hfi1] __handle_irq_event_percpu+0x44/0x1c0 handle_irq_event_percpu+0x32/0x80 handle_irq_event+0x3c/0x60 handle_edge_irq+0x7f/0x150 handle_irq+0xe4/0x1a0 do_IRQ+0x4d/0xf0 common_interrupt+0x162/0x162 The race can also lead to a use after free which could be similar to: general protection fault: 0000 1 SMP CPU: 71 PID: 177147 Comm: IMB-MPI1 Kdump: loaded Tainted: G W OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 task: ffff9962a8098000 ti: ffff99717a508000 task.ti: ffff99717a508000 __kmalloc+0x94/0x230 Call Trace: ? hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_aio_write+0xba/0x110 [hfi1] do_sync_readv_writev+0x7b/0xd0 do_readv_writev+0xce/0x260 ? handle_mm_fault+0x39d/0x9b0 ? pick_next_task_fair+0x5f/0x1b0 ? sched_clock_cpu+0x85/0xc0 ? __schedule+0x13a/0x890 vfs_writev+0x35/0x60 SyS_writev+0x7f/0x110 system_call_fastpath+0x22/0x27 Use the appropriate kref API to verify access. Reorder context cleanup to ensure context removal before cleanup occurs correctly. Cc: stable@vger.kernel.org # v4.14.0+ Fixes: f683c80ca68e ("IB/hfi1: Resolve kernel panics by reference counting receive contexts") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-02-27 01:45:35 +09:00
int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
{
IB/hfi1: Close race condition on user context disable and close When disabling and removing a receive context, it is possible for an asynchronous event (i.e IRQ) to occur. Because of this, there is a race between cleaning up the context, and the context being used by the asynchronous event. cpu 0 (context cleanup) rc->ref_count-- (ref_count == 0) hfi1_rcd_free() cpu 1 (IRQ (with rcd index)) rcd_get_by_index() lock ref_count+++ <-- reference count race (WARNING) return rcd unlock cpu 0 hfi1_free_ctxtdata() <-- incorrect free location lock remove rcd from array unlock free rcd This race will cause the following WARNING trace: WARNING: CPU: 0 PID: 175027 at include/linux/kref.h:52 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] CPU: 0 PID: 175027 Comm: IMB-MPI1 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 Call Trace: dump_stack+0x19/0x1b __warn+0xd8/0x100 warn_slowpath_null+0x1d/0x20 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] is_rcv_urgent_int+0x24/0x90 [hfi1] general_interrupt+0x1b6/0x210 [hfi1] __handle_irq_event_percpu+0x44/0x1c0 handle_irq_event_percpu+0x32/0x80 handle_irq_event+0x3c/0x60 handle_edge_irq+0x7f/0x150 handle_irq+0xe4/0x1a0 do_IRQ+0x4d/0xf0 common_interrupt+0x162/0x162 The race can also lead to a use after free which could be similar to: general protection fault: 0000 1 SMP CPU: 71 PID: 177147 Comm: IMB-MPI1 Kdump: loaded Tainted: G W OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 task: ffff9962a8098000 ti: ffff99717a508000 task.ti: ffff99717a508000 __kmalloc+0x94/0x230 Call Trace: ? hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_aio_write+0xba/0x110 [hfi1] do_sync_readv_writev+0x7b/0xd0 do_readv_writev+0xce/0x260 ? handle_mm_fault+0x39d/0x9b0 ? pick_next_task_fair+0x5f/0x1b0 ? sched_clock_cpu+0x85/0xc0 ? __schedule+0x13a/0x890 vfs_writev+0x35/0x60 SyS_writev+0x7f/0x110 system_call_fastpath+0x22/0x27 Use the appropriate kref API to verify access. Reorder context cleanup to ensure context removal before cleanup occurs correctly. Cc: stable@vger.kernel.org # v4.14.0+ Fixes: f683c80ca68e ("IB/hfi1: Resolve kernel panics by reference counting receive contexts") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-02-27 01:45:35 +09:00
return kref_get_unless_zero(&rcd->kref);
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
}
/**
* allocate_rcd_index - allocate an rcd index from the rcd array
* @dd: pointer to a valid devdata structure
* @rcd: rcd data structure to assign
* @index: pointer to index that is allocated
*
* Find an empty index in the rcd array, and assign the given rcd to it.
* If the array is full, we are EBUSY.
*
*/
static int allocate_rcd_index(struct hfi1_devdata *dd,
struct hfi1_ctxtdata *rcd, u16 *index)
{
unsigned long flags;
u16 ctxt;
spin_lock_irqsave(&dd->uctxt_lock, flags);
for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
if (!dd->rcd[ctxt])
break;
if (ctxt < dd->num_rcv_contexts) {
rcd->ctxt = ctxt;
dd->rcd[ctxt] = rcd;
hfi1_rcd_init(rcd);
}
spin_unlock_irqrestore(&dd->uctxt_lock, flags);
if (ctxt >= dd->num_rcv_contexts)
return -EBUSY;
*index = ctxt;
return 0;
}
/**
* hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the
* array
* @dd: pointer to a valid devdata structure
* @ctxt: the index of an possilbe rcd
*
* This is a wrapper for hfi1_rcd_get_by_index() to validate that the given
* ctxt index is valid.
*
* The caller is responsible for making the _put().
*
*/
struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
u16 ctxt)
{
if (ctxt < dd->num_rcv_contexts)
return hfi1_rcd_get_by_index(dd, ctxt);
return NULL;
}
/**
* hfi1_rcd_get_by_index
* @dd: pointer to a valid devdata structure
* @ctxt: the index of an possilbe rcd
*
* We need to protect access to the rcd array. If access is needed to
* one or more index, get the protecting spinlock and then increment the
* kref.
*
* The caller is responsible for making the _put().
*
*/
struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
{
unsigned long flags;
struct hfi1_ctxtdata *rcd = NULL;
spin_lock_irqsave(&dd->uctxt_lock, flags);
if (dd->rcd[ctxt]) {
rcd = dd->rcd[ctxt];
IB/hfi1: Close race condition on user context disable and close When disabling and removing a receive context, it is possible for an asynchronous event (i.e IRQ) to occur. Because of this, there is a race between cleaning up the context, and the context being used by the asynchronous event. cpu 0 (context cleanup) rc->ref_count-- (ref_count == 0) hfi1_rcd_free() cpu 1 (IRQ (with rcd index)) rcd_get_by_index() lock ref_count+++ <-- reference count race (WARNING) return rcd unlock cpu 0 hfi1_free_ctxtdata() <-- incorrect free location lock remove rcd from array unlock free rcd This race will cause the following WARNING trace: WARNING: CPU: 0 PID: 175027 at include/linux/kref.h:52 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] CPU: 0 PID: 175027 Comm: IMB-MPI1 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 Call Trace: dump_stack+0x19/0x1b __warn+0xd8/0x100 warn_slowpath_null+0x1d/0x20 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] is_rcv_urgent_int+0x24/0x90 [hfi1] general_interrupt+0x1b6/0x210 [hfi1] __handle_irq_event_percpu+0x44/0x1c0 handle_irq_event_percpu+0x32/0x80 handle_irq_event+0x3c/0x60 handle_edge_irq+0x7f/0x150 handle_irq+0xe4/0x1a0 do_IRQ+0x4d/0xf0 common_interrupt+0x162/0x162 The race can also lead to a use after free which could be similar to: general protection fault: 0000 1 SMP CPU: 71 PID: 177147 Comm: IMB-MPI1 Kdump: loaded Tainted: G W OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 task: ffff9962a8098000 ti: ffff99717a508000 task.ti: ffff99717a508000 __kmalloc+0x94/0x230 Call Trace: ? hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_aio_write+0xba/0x110 [hfi1] do_sync_readv_writev+0x7b/0xd0 do_readv_writev+0xce/0x260 ? handle_mm_fault+0x39d/0x9b0 ? pick_next_task_fair+0x5f/0x1b0 ? sched_clock_cpu+0x85/0xc0 ? __schedule+0x13a/0x890 vfs_writev+0x35/0x60 SyS_writev+0x7f/0x110 system_call_fastpath+0x22/0x27 Use the appropriate kref API to verify access. Reorder context cleanup to ensure context removal before cleanup occurs correctly. Cc: stable@vger.kernel.org # v4.14.0+ Fixes: f683c80ca68e ("IB/hfi1: Resolve kernel panics by reference counting receive contexts") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-02-27 01:45:35 +09:00
if (!hfi1_rcd_get(rcd))
rcd = NULL;
}
spin_unlock_irqrestore(&dd->uctxt_lock, flags);
return rcd;
}
/*
* Common code for user and kernel context create and setup.
* NOTE: the initial kref is done here (hf1_rcd_init()).
*/
int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
struct hfi1_ctxtdata **context)
{
struct hfi1_devdata *dd = ppd->dd;
struct hfi1_ctxtdata *rcd;
unsigned kctxt_ngroups = 0;
u32 base;
if (dd->rcv_entries.nctxt_extra >
dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
(dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa);
if (rcd) {
u32 rcvtids, max_entries;
u16 ctxt;
int ret;
ret = allocate_rcd_index(dd, rcd, &ctxt);
if (ret) {
*context = NULL;
kfree(rcd);
return ret;
}
INIT_LIST_HEAD(&rcd->qp_wait_list);
hfi1_exp_tid_group_init(rcd);
rcd->ppd = ppd;
rcd->dd = dd;
rcd->numa_id = numa;
rcd->rcv_array_groups = dd->rcv_entries.ngroups;
rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
mutex_init(&rcd->exp_mutex);
spin_lock_init(&rcd->exp_lock);
INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
/*
* Calculate the context's RcvArray entry starting point.
* We do this here because we have to take into account all
* the RcvArray entries that previous context would have
* taken and we have to account for any extra groups assigned
* to the static (kernel) or dynamic (vnic/user) contexts.
*/
if (ctxt < dd->first_dyn_alloc_ctxt) {
if (ctxt < kctxt_ngroups) {
base = ctxt * (dd->rcv_entries.ngroups + 1);
rcd->rcv_array_groups++;
} else {
base = kctxt_ngroups +
(ctxt * dd->rcv_entries.ngroups);
}
} else {
u16 ct = ctxt - dd->first_dyn_alloc_ctxt;
base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
kctxt_ngroups);
if (ct < dd->rcv_entries.nctxt_extra) {
base += ct * (dd->rcv_entries.ngroups + 1);
rcd->rcv_array_groups++;
} else {
base += dd->rcv_entries.nctxt_extra +
(ct * dd->rcv_entries.ngroups);
}
}
rcd->eager_base = base * dd->rcv_entries.group_size;
rcd->rcvhdrq_cnt = rcvhdrcnt;
rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
rcd->rhf_offset =
rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
/*
* Simple Eager buffer allocation: we have already pre-allocated
* the number of RcvArray entry groups. Each ctxtdata structure
* holds the number of groups for that context.
*
* To follow CSR requirements and maintain cacheline alignment,
* make sure all sizes and bases are multiples of group_size.
*
* The expected entry count is what is left after assigning
* eager.
*/
max_entries = rcd->rcv_array_groups *
dd->rcv_entries.group_size;
rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
rcd->egrbufs.count = round_down(rcvtids,
dd->rcv_entries.group_size);
if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
rcd->ctxt);
rcd->egrbufs.count = MAX_EAGER_ENTRIES;
}
hfi1_cdbg(PROC,
"ctxt%u: max Eager buffer RcvArray entries: %u\n",
rcd->ctxt, rcd->egrbufs.count);
/*
* Allocate array that will hold the eager buffer accounting
* data.
* This will allocate the maximum possible buffer count based
* on the value of the RcvArray split parameter.
* The resulting value will be rounded down to the closest
* multiple of dd->rcv_entries.group_size.
*/
rcd->egrbufs.buffers =
kcalloc_node(rcd->egrbufs.count,
sizeof(*rcd->egrbufs.buffers),
GFP_KERNEL, numa);
if (!rcd->egrbufs.buffers)
goto bail;
rcd->egrbufs.rcvtids =
kcalloc_node(rcd->egrbufs.count,
sizeof(*rcd->egrbufs.rcvtids),
GFP_KERNEL, numa);
if (!rcd->egrbufs.rcvtids)
goto bail;
rcd->egrbufs.size = eager_buffer_size;
/*
* The size of the buffers programmed into the RcvArray
* entries needs to be big enough to handle the highest
* MTU supported.
*/
if (rcd->egrbufs.size < hfi1_max_mtu) {
rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
hfi1_cdbg(PROC,
"ctxt%u: eager bufs size too small. Adjusting to %u\n",
rcd->ctxt, rcd->egrbufs.size);
}
rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
/* Applicable only for statically created kernel contexts */
if (ctxt < dd->first_dyn_alloc_ctxt) {
rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
GFP_KERNEL, numa);
if (!rcd->opstats)
goto bail;
/* Initialize TID flow generations for the context */
hfi1_kern_init_ctxt_generations(rcd);
}
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
*context = rcd;
return 0;
}
bail:
*context = NULL;
hfi1_free_ctxt(rcd);
return -ENOMEM;
}
/**
* hfi1_free_ctxt
* @rcd: pointer to an initialized rcd data structure
*
* This wrapper is the free function that matches hfi1_create_ctxtdata().
* When a context is done being used (kernel or user), this function is called
* for the "final" put to match the kref init from hf1i_create_ctxtdata().
* Other users of the context do a get/put sequence to make sure that the
* structure isn't removed while in use.
*/
void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd)
{
hfi1_rcd_put(rcd);
}
/*
* Convert a receive header entry size that to the encoding used in the CSR.
*
* Return a zero if the given size is invalid.
*/
static inline u64 encode_rcv_header_entry_size(u16 size)
{
/* there are only 3 valid receive header entry sizes */
if (size == 2)
return 1;
if (size == 16)
return 2;
else if (size == 32)
return 4;
return 0; /* invalid */
}
/*
* Select the largest ccti value over all SLs to determine the intra-
* packet gap for the link.
*
* called with cca_timer_lock held (to protect access to cca_timer
* array), and rcu_read_lock() (to protect access to cc_state).
*/
void set_link_ipg(struct hfi1_pportdata *ppd)
{
struct hfi1_devdata *dd = ppd->dd;
struct cc_state *cc_state;
int i;
u16 cce, ccti_limit, max_ccti = 0;
u16 shift, mult;
u64 src;
u32 current_egress_rate; /* Mbits /sec */
u32 max_pkt_time;
/*
* max_pkt_time is the maximum packet egress time in units
* of the fabric clock period 1/(805 MHz).
*/
cc_state = get_cc_state(ppd);
if (!cc_state)
/*
* This should _never_ happen - rcu_read_lock() is held,
* and set_link_ipg() should not be called if cc_state
* is NULL.
*/
return;
for (i = 0; i < OPA_MAX_SLS; i++) {
u16 ccti = ppd->cca_timer[i].ccti;
if (ccti > max_ccti)
max_ccti = ccti;
}
ccti_limit = cc_state->cct.ccti_limit;
if (max_ccti > ccti_limit)
max_ccti = ccti_limit;
cce = cc_state->cct.entries[max_ccti].entry;
shift = (cce & 0xc000) >> 14;
mult = (cce & 0x3fff);
current_egress_rate = active_egress_rate(ppd);
max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
src = (max_pkt_time >> shift) * mult;
src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
}
static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
{
struct cca_timer *cca_timer;
struct hfi1_pportdata *ppd;
int sl;
u16 ccti_timer, ccti_min;
struct cc_state *cc_state;
unsigned long flags;
enum hrtimer_restart ret = HRTIMER_NORESTART;
cca_timer = container_of(t, struct cca_timer, hrtimer);
ppd = cca_timer->ppd;
sl = cca_timer->sl;
rcu_read_lock();
cc_state = get_cc_state(ppd);
if (!cc_state) {
rcu_read_unlock();
return HRTIMER_NORESTART;
}
/*
* 1) decrement ccti for SL
* 2) calculate IPG for link (set_link_ipg())
* 3) restart timer, unless ccti is at min value
*/
ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
spin_lock_irqsave(&ppd->cca_timer_lock, flags);
if (cca_timer->ccti > ccti_min) {
cca_timer->ccti--;
set_link_ipg(ppd);
}
if (cca_timer->ccti > ccti_min) {
unsigned long nsec = 1024 * ccti_timer;
/* ccti_timer is in units of 1.024 usec */
hrtimer_forward_now(t, ns_to_ktime(nsec));
ret = HRTIMER_RESTART;
}
spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
rcu_read_unlock();
return ret;
}
/*
* Common code for initializing the physical port structure.
*/
void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
{
int i;
uint default_pkey_idx;
struct cc_state *cc_state;
ppd->dd = dd;
ppd->hw_pidx = hw_pidx;
ppd->port = port; /* IB port number, not index */
ppd->prev_link_width = LINK_WIDTH_DEFAULT;
/*
* There are C_VL_COUNT number of PortVLXmitWait counters.
* Adding 1 to C_VL_COUNT to include the PortXmitWait counter.
*/
for (i = 0; i < C_VL_COUNT + 1; i++) {
ppd->port_vl_xmit_wait_last[i] = 0;
ppd->vl_xmit_flit_cnt[i] = 0;
}
default_pkey_idx = 1;
ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
IB/hfi1: Adjust pkey entry in index 0 [ Upstream commit 62004871e1fa7f9a60797595c03477af5b5ec36f ] It is possible for the primary IPoIB network device associated with any RDMA device to fail to join certain multicast groups preventing IPv6 neighbor discovery and possibly other network ULPs from working correctly. The IPv4 broadcast group is not affected as the IPoIB network device handles joining that multicast group directly. This is because the primary IPoIB network device uses the pkey at ndex 0 in the associated RDMA device's pkey table. Anytime the pkey value of index 0 changes, the primary IPoIB network device automatically modifies it's broadcast address (i.e. /sys/class/net/[ib0]/broadcast), since the broadcast address includes the pkey value, and then bounces carrier. This includes initial pkey assignment, such as when the pkey at index 0 transitions from the opa default of invalid (0x0000) to some value such as the OPA default pkey for Virtual Fabric 0: 0x8001 or when the fabric manager is restarted with a configuration change causing the pkey at index 0 to change. Many network ULPs are not sensitive to the carrier bounce and are not expecting the broadcast address to change including the linux IPv6 stack. This problem does not affect IPoIB child network devices as their pkey value is constant for all time. To mitigate this issue, change the default pkey in at index 0 to 0x8001 to cover the predominant case and avoid issues as ipoib comes up and the FM sweeps. At some point, ipoib multicast support should automatically fix non-broadcast addresses as it does with the primary broadcast address. Fixes: 7724105686e7 ("IB/hfi1: add driver files") Link: https://lore.kernel.org/r/20210715160445.142451.47651.stgit@awfm-01.cornelisnetworks.com Suggested-by: Josh Collier <josh.d.collier@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Sasha Levin <sashal@kernel.org>
2021-07-16 01:04:45 +09:00
ppd->pkeys[0] = 0x8001;
INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
INIT_WORK(&ppd->link_up_work, handle_link_up);
INIT_WORK(&ppd->link_down_work, handle_link_down);
INIT_WORK(&ppd->freeze_work, handle_freeze);
INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
INIT_WORK(&ppd->sma_message_work, handle_sma_message);
INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link);
INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
mutex_init(&ppd->hls_lock);
spin_lock_init(&ppd->qsfp_info.qsfp_lock);
ppd->qsfp_info.ppd = ppd;
ppd->sm_trap_qp = 0x0;
ppd->sa_qp = 0x1;
ppd->hfi1_wq = NULL;
spin_lock_init(&ppd->cca_timer_lock);
for (i = 0; i < OPA_MAX_SLS; i++) {
hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
ppd->cca_timer[i].ppd = ppd;
ppd->cca_timer[i].sl = i;
ppd->cca_timer[i].ccti = 0;
ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
}
ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
spin_lock_init(&ppd->cc_state_lock);
spin_lock_init(&ppd->cc_log_lock);
cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
RCU_INIT_POINTER(ppd->cc_state, cc_state);
if (!cc_state)
goto bail;
return;
bail:
dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
}
/*
* Do initialization for device that is only needed on
* first detect, not on resets.
*/
static int loadtime_init(struct hfi1_devdata *dd)
{
return 0;
}
/**
* init_after_reset - re-initialize after a reset
* @dd: the hfi1_ib device
*
* sanity check at least some of the values after reset, and
* ensure no receive or transmit (explicitly, in case reset
* failed
*/
static int init_after_reset(struct hfi1_devdata *dd)
{
int i;
struct hfi1_ctxtdata *rcd;
/*
* Ensure chip does no sends or receives, tail updates, or
* pioavail updates while we re-initialize. This is mostly
* for the driver data structures, not chip registers.
*/
for (i = 0; i < dd->num_rcv_contexts; i++) {
rcd = hfi1_rcd_get_by_index(dd, i);
hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
HFI1_RCVCTRL_INTRAVAIL_DIS |
HFI1_RCVCTRL_TAILUPD_DIS, rcd);
hfi1_rcd_put(rcd);
}
pio_send_control(dd, PSC_GLOBAL_DISABLE);
for (i = 0; i < dd->num_send_contexts; i++)
sc_disable(dd->send_contexts[i].sc);
return 0;
}
static void enable_chip(struct hfi1_devdata *dd)
{
struct hfi1_ctxtdata *rcd;
u32 rcvmask;
u16 i;
/* enable PIO send */
pio_send_control(dd, PSC_GLOBAL_ENABLE);
/*
* Enable kernel ctxts' receive and receive interrupt.
* Other ctxts done as user opens and initializes them.
*/
for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
rcd = hfi1_rcd_get_by_index(dd, i);
if (!rcd)
continue;
rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL))
rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
if (HFI1_CAP_IS_KSET(TID_RDMA))
rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
hfi1_rcvctrl(dd, rcvmask, rcd);
sc_enable(rcd->sc);
hfi1_rcd_put(rcd);
}
}
/**
* create_workqueues - create per port workqueues
* @dd: the hfi1_ib device
*/
static int create_workqueues(struct hfi1_devdata *dd)
{
int pidx;
struct hfi1_pportdata *ppd;
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
if (!ppd->hfi1_wq) {
ppd->hfi1_wq =
alloc_workqueue(
"hfi%d_%d",
WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
WQ_MEM_RECLAIM,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
dd->unit, pidx);
if (!ppd->hfi1_wq)
goto wq_error;
}
if (!ppd->link_wq) {
/*
* Make the link workqueue single-threaded to enforce
* serialization.
*/
ppd->link_wq =
alloc_workqueue(
"hfi_link_%d_%d",
WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND,
1, /* max_active */
dd->unit, pidx);
if (!ppd->link_wq)
goto wq_error;
}
}
return 0;
wq_error:
pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
if (ppd->hfi1_wq) {
destroy_workqueue(ppd->hfi1_wq);
ppd->hfi1_wq = NULL;
}
if (ppd->link_wq) {
destroy_workqueue(ppd->link_wq);
ppd->link_wq = NULL;
}
}
return -ENOMEM;
}
IB/hfi1: Do not destroy hfi1_wq when the device is shut down commit 28b70cd9236563e1a88a6094673fef3c08db0d51 upstream. The workqueue hfi1_wq is destroyed in function shutdown_device(), which is called by either shutdown_one() or remove_one(). The function shutdown_one() is called when the kernel is rebooted while remove_one() is called when the hfi1 driver is unloaded. When the kernel is rebooted, hfi1_wq is destroyed while all qps are still active, leading to a kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000102 IP: [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 PGD 0 Oops: 0000 [#1] SMP Modules linked in: dm_round_robin nvme_rdma(OE) nvme_fabrics(OE) nvme_core(OE) ib_isert iscsi_target_mod target_core_mod ib_ucm mlx4_ib iTCO_wdt iTCO_vendor_support mxm_wmi sb_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm rpcrdma sunrpc irqbypass crc32_pclmul ghash_clmulni_intel rdma_ucm aesni_intel ib_uverbs lrw gf128mul opa_vnic glue_helper ablk_helper ib_iser cryptd ib_umad rdma_cm iw_cm ses enclosure libiscsi scsi_transport_sas pcspkr joydev ib_ipoib(OE) scsi_transport_iscsi ib_cm sg ipmi_ssif mei_me lpc_ich i2c_i801 mei ioatdma ipmi_si dm_multipath ipmi_devintf ipmi_msghandler wmi acpi_pad acpi_power_meter hangcheck_timer ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm hfi1(OE) crct10dif_pclmul crct10dif_common crc32c_intel drm ahci mlx4_core libahci rdmavt(OE) igb megaraid_sas ib_core libata drm_panel_orientation_quirks ptp pps_core devlink dca i2c_algo_bit dm_mirror dm_region_hash dm_log dm_mod CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Phegda X2226A/S2600CW, BIOS SE5C610.86B.01.01.0024.021320181901 02/13/2018 task: ffff8a799ba0d140 ti: ffff8a799bad8000 task.ti: ffff8a799bad8000 RIP: 0010:[<ffffffff94cb7b02>] [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 RSP: 0018:ffff8a90dde43d80 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000086 RCX: 0000000000000000 RDX: ffff8a90b924fcb8 RSI: 0000000000000000 RDI: 000000000000001b RBP: ffff8a90dde43db8 R08: ffff8a799ba0d6d8 R09: ffff8a90dde53900 R10: 0000000000000002 R11: ffff8a90dde43de8 R12: ffff8a90b924fcb8 R13: 000000000000001b R14: 0000000000000000 R15: ffff8a90d2890000 FS: 0000000000000000(0000) GS:ffff8a90dde40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000102 CR3: 0000001a70410000 CR4: 00000000001607e0 Call Trace: [<ffffffff94cb8105>] queue_work_on+0x45/0x50 [<ffffffffc03f781e>] _hfi1_schedule_send+0x6e/0xc0 [hfi1] [<ffffffffc03f78a2>] hfi1_schedule_send+0x32/0x70 [hfi1] [<ffffffffc02cf2d9>] rvt_rc_timeout+0xe9/0x130 [rdmavt] [<ffffffff94ce563a>] ? trigger_load_balance+0x6a/0x280 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94ca7f58>] call_timer_fn+0x38/0x110 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94caa3bd>] run_timer_softirq+0x24d/0x300 [<ffffffff94ca0f05>] __do_softirq+0xf5/0x280 [<ffffffff9537832c>] call_softirq+0x1c/0x30 [<ffffffff94c2e675>] do_softirq+0x65/0xa0 [<ffffffff94ca1285>] irq_exit+0x105/0x110 [<ffffffff953796c8>] smp_apic_timer_interrupt+0x48/0x60 [<ffffffff95375df2>] apic_timer_interrupt+0x162/0x170 <EOI> [<ffffffff951adfb7>] ? cpuidle_enter_state+0x57/0xd0 [<ffffffff951ae10e>] cpuidle_idle_call+0xde/0x230 [<ffffffff94c366de>] arch_cpu_idle+0xe/0xc0 [<ffffffff94cfc3ba>] cpu_startup_entry+0x14a/0x1e0 [<ffffffff94c57db7>] start_secondary+0x1f7/0x270 [<ffffffff94c000d5>] start_cpu+0x5/0x14 The solution is to destroy the workqueue only when the hfi1 driver is unloaded, not when the device is shut down. In addition, when the device is shut down, no more work should be scheduled on the workqueues and the workqueues are flushed. Fixes: 8d3e71136a08 ("IB/{hfi1, qib}: Add handling of kernel restart") Link: https://lore.kernel.org/r/20200623204047.107638.77646.stgit@awfm-01.aw.intel.com Cc: <stable@vger.kernel.org> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-06-24 05:40:47 +09:00
/**
* destroy_workqueues - destroy per port workqueues
* @dd: the hfi1_ib device
*/
static void destroy_workqueues(struct hfi1_devdata *dd)
{
int pidx;
struct hfi1_pportdata *ppd;
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
if (ppd->hfi1_wq) {
destroy_workqueue(ppd->hfi1_wq);
ppd->hfi1_wq = NULL;
}
if (ppd->link_wq) {
destroy_workqueue(ppd->link_wq);
ppd->link_wq = NULL;
}
IB/hfi1: Do not destroy hfi1_wq when the device is shut down commit 28b70cd9236563e1a88a6094673fef3c08db0d51 upstream. The workqueue hfi1_wq is destroyed in function shutdown_device(), which is called by either shutdown_one() or remove_one(). The function shutdown_one() is called when the kernel is rebooted while remove_one() is called when the hfi1 driver is unloaded. When the kernel is rebooted, hfi1_wq is destroyed while all qps are still active, leading to a kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000102 IP: [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 PGD 0 Oops: 0000 [#1] SMP Modules linked in: dm_round_robin nvme_rdma(OE) nvme_fabrics(OE) nvme_core(OE) ib_isert iscsi_target_mod target_core_mod ib_ucm mlx4_ib iTCO_wdt iTCO_vendor_support mxm_wmi sb_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm rpcrdma sunrpc irqbypass crc32_pclmul ghash_clmulni_intel rdma_ucm aesni_intel ib_uverbs lrw gf128mul opa_vnic glue_helper ablk_helper ib_iser cryptd ib_umad rdma_cm iw_cm ses enclosure libiscsi scsi_transport_sas pcspkr joydev ib_ipoib(OE) scsi_transport_iscsi ib_cm sg ipmi_ssif mei_me lpc_ich i2c_i801 mei ioatdma ipmi_si dm_multipath ipmi_devintf ipmi_msghandler wmi acpi_pad acpi_power_meter hangcheck_timer ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm hfi1(OE) crct10dif_pclmul crct10dif_common crc32c_intel drm ahci mlx4_core libahci rdmavt(OE) igb megaraid_sas ib_core libata drm_panel_orientation_quirks ptp pps_core devlink dca i2c_algo_bit dm_mirror dm_region_hash dm_log dm_mod CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Phegda X2226A/S2600CW, BIOS SE5C610.86B.01.01.0024.021320181901 02/13/2018 task: ffff8a799ba0d140 ti: ffff8a799bad8000 task.ti: ffff8a799bad8000 RIP: 0010:[<ffffffff94cb7b02>] [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 RSP: 0018:ffff8a90dde43d80 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000086 RCX: 0000000000000000 RDX: ffff8a90b924fcb8 RSI: 0000000000000000 RDI: 000000000000001b RBP: ffff8a90dde43db8 R08: ffff8a799ba0d6d8 R09: ffff8a90dde53900 R10: 0000000000000002 R11: ffff8a90dde43de8 R12: ffff8a90b924fcb8 R13: 000000000000001b R14: 0000000000000000 R15: ffff8a90d2890000 FS: 0000000000000000(0000) GS:ffff8a90dde40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000102 CR3: 0000001a70410000 CR4: 00000000001607e0 Call Trace: [<ffffffff94cb8105>] queue_work_on+0x45/0x50 [<ffffffffc03f781e>] _hfi1_schedule_send+0x6e/0xc0 [hfi1] [<ffffffffc03f78a2>] hfi1_schedule_send+0x32/0x70 [hfi1] [<ffffffffc02cf2d9>] rvt_rc_timeout+0xe9/0x130 [rdmavt] [<ffffffff94ce563a>] ? trigger_load_balance+0x6a/0x280 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94ca7f58>] call_timer_fn+0x38/0x110 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94caa3bd>] run_timer_softirq+0x24d/0x300 [<ffffffff94ca0f05>] __do_softirq+0xf5/0x280 [<ffffffff9537832c>] call_softirq+0x1c/0x30 [<ffffffff94c2e675>] do_softirq+0x65/0xa0 [<ffffffff94ca1285>] irq_exit+0x105/0x110 [<ffffffff953796c8>] smp_apic_timer_interrupt+0x48/0x60 [<ffffffff95375df2>] apic_timer_interrupt+0x162/0x170 <EOI> [<ffffffff951adfb7>] ? cpuidle_enter_state+0x57/0xd0 [<ffffffff951ae10e>] cpuidle_idle_call+0xde/0x230 [<ffffffff94c366de>] arch_cpu_idle+0xe/0xc0 [<ffffffff94cfc3ba>] cpu_startup_entry+0x14a/0x1e0 [<ffffffff94c57db7>] start_secondary+0x1f7/0x270 [<ffffffff94c000d5>] start_cpu+0x5/0x14 The solution is to destroy the workqueue only when the hfi1 driver is unloaded, not when the device is shut down. In addition, when the device is shut down, no more work should be scheduled on the workqueues and the workqueues are flushed. Fixes: 8d3e71136a08 ("IB/{hfi1, qib}: Add handling of kernel restart") Link: https://lore.kernel.org/r/20200623204047.107638.77646.stgit@awfm-01.aw.intel.com Cc: <stable@vger.kernel.org> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-06-24 05:40:47 +09:00
}
}
/**
* enable_general_intr() - Enable the IRQs that will be handled by the
* general interrupt handler.
* @dd: valid devdata
*
*/
static void enable_general_intr(struct hfi1_devdata *dd)
{
set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true);
set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true);
set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true);
set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true);
set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true);
set_intr_bits(dd, IS_DC_START, IS_DC_END, true);
set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true);
}
/**
* hfi1_init - do the actual initialization sequence on the chip
* @dd: the hfi1_ib device
* @reinit: re-initializing, so don't allocate new memory
*
* Do the actual initialization sequence on the chip. This is done
* both from the init routine called from the PCI infrastructure, and
* when we reset the chip, or detect that it was reset internally,
* or it's administratively re-enabled.
*
* Memory allocation here and in called routines is only done in
* the first case (reinit == 0). We have to be careful, because even
* without memory allocation, we need to re-write all the chip registers
* TIDs, etc. after the reset or enable has completed.
*/
int hfi1_init(struct hfi1_devdata *dd, int reinit)
{
int ret = 0, pidx, lastfail = 0;
unsigned long len;
u16 i;
struct hfi1_ctxtdata *rcd;
struct hfi1_pportdata *ppd;
/* Set up send low level handlers */
dd->process_pio_send = hfi1_verbs_send_pio;
dd->process_dma_send = hfi1_verbs_send_dma;
dd->pio_inline_send = pio_copy;
dd->process_vnic_dma_send = hfi1_vnic_send_dma;
if (is_ax(dd)) {
atomic_set(&dd->drop_packet, DROP_PACKET_ON);
dd->do_drop = 1;
} else {
atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
dd->do_drop = 0;
}
/* make sure the link is not "up" */
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
ppd->linkup = 0;
}
if (reinit)
ret = init_after_reset(dd);
else
ret = loadtime_init(dd);
if (ret)
goto done;
/* allocate dummy tail memory for all receive contexts */
dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
sizeof(u64),
&dd->rcvhdrtail_dummy_dma,
GFP_KERNEL);
if (!dd->rcvhdrtail_dummy_kvaddr) {
dd_dev_err(dd, "cannot allocate dummy tail memory\n");
ret = -ENOMEM;
goto done;
}
/* dd->rcd can be NULL if early initialization failed */
for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) {
/*
* Set up the (kernel) rcvhdr queue and egr TIDs. If doing
* re-init, the simplest way to handle this is to free
* existing, and re-allocate.
* Need to re-create rest of ctxt 0 ctxtdata as well.
*/
rcd = hfi1_rcd_get_by_index(dd, i);
if (!rcd)
continue;
rcd->do_interrupt = &handle_receive_interrupt;
lastfail = hfi1_create_rcvhdrq(dd, rcd);
if (!lastfail)
lastfail = hfi1_setup_eagerbufs(rcd);
if (!lastfail)
lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
if (lastfail) {
dd_dev_err(dd,
"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
ret = lastfail;
}
/* enable IRQ */
hfi1_rcd_put(rcd);
}
/* Allocate enough memory for user event notification. */
len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS *
sizeof(*dd->events));
dd->events = vmalloc_user(len);
if (!dd->events)
dd_dev_err(dd, "Failed to allocate user events page\n");
/*
* Allocate a page for device and port status.
* Page will be shared amongst all user processes.
*/
dd->status = vmalloc_user(PAGE_SIZE);
if (!dd->status)
dd_dev_err(dd, "Failed to allocate dev status page\n");
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
if (dd->status)
/* Currently, we only have one port */
ppd->statusp = &dd->status->port;
set_mtu(ppd);
}
/* enable chip even if we have an error, so we can debug cause */
enable_chip(dd);
done:
/*
* Set status even if port serdes is not initialized
* so that diags will work.
*/
if (dd->status)
dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
HFI1_STATUS_INITTED;
if (!ret) {
/* enable all interrupts from the chip */
enable_general_intr(dd);
init_qsfp_int(dd);
/* chip is OK for user apps; mark it as initialized */
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
/*
* start the serdes - must be after interrupts are
* enabled so we are notified when the link goes up
*/
lastfail = bringup_serdes(ppd);
if (lastfail)
dd_dev_info(dd,
"Failed to bring up port %u\n",
ppd->port);
/*
* Set status even if port serdes is not initialized
* so that diags will work.
*/
if (ppd->statusp)
*ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
HFI1_STATUS_INITTED;
if (!ppd->link_speed_enabled)
continue;
}
}
/* if ret is non-zero, we probably should do some cleanup here... */
return ret;
}
struct hfi1_devdata *hfi1_lookup(int unit)
{
return xa_load(&hfi1_dev_table, unit);
}
/*
* Stop the timers during unit shutdown, or after an error late
* in initialization.
*/
static void stop_timers(struct hfi1_devdata *dd)
{
struct hfi1_pportdata *ppd;
int pidx;
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
if (ppd->led_override_timer.function) {
del_timer_sync(&ppd->led_override_timer);
atomic_set(&ppd->led_override_timer_active, 0);
}
}
}
/**
* shutdown_device - shut down a device
* @dd: the hfi1_ib device
*
* This is called to make the device quiet when we are about to
* unload the driver, and also when the device is administratively
* disabled. It does not free any data structures.
* Everything it does has to be setup again by hfi1_init(dd, 1)
*/
static void shutdown_device(struct hfi1_devdata *dd)
{
struct hfi1_pportdata *ppd;
struct hfi1_ctxtdata *rcd;
unsigned pidx;
int i;
if (dd->flags & HFI1_SHUTDOWN)
return;
dd->flags |= HFI1_SHUTDOWN;
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
ppd->linkup = 0;
if (ppd->statusp)
*ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
HFI1_STATUS_IB_READY);
}
dd->flags &= ~HFI1_INITTED;
/* mask and clean up interrupts */
set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
msix_clean_up_interrupts(dd);
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
for (i = 0; i < dd->num_rcv_contexts; i++) {
rcd = hfi1_rcd_get_by_index(dd, i);
hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
HFI1_RCVCTRL_CTXT_DIS |
HFI1_RCVCTRL_INTRAVAIL_DIS |
HFI1_RCVCTRL_PKEY_DIS |
HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd);
hfi1_rcd_put(rcd);
}
/*
* Gracefully stop all sends allowing any in progress to
* trickle out first.
*/
for (i = 0; i < dd->num_send_contexts; i++)
sc_flush(dd->send_contexts[i].sc);
}
/*
* Enough for anything that's going to trickle out to have actually
* done so.
*/
udelay(20);
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
/* disable all contexts */
for (i = 0; i < dd->num_send_contexts; i++)
sc_disable(dd->send_contexts[i].sc);
/* disable the send device */
pio_send_control(dd, PSC_GLOBAL_DISABLE);
shutdown_led_override(ppd);
/*
* Clear SerdesEnable.
* We can't count on interrupts since we are stopping.
*/
hfi1_quiet_serdes(ppd);
IB/hfi1: Do not destroy hfi1_wq when the device is shut down commit 28b70cd9236563e1a88a6094673fef3c08db0d51 upstream. The workqueue hfi1_wq is destroyed in function shutdown_device(), which is called by either shutdown_one() or remove_one(). The function shutdown_one() is called when the kernel is rebooted while remove_one() is called when the hfi1 driver is unloaded. When the kernel is rebooted, hfi1_wq is destroyed while all qps are still active, leading to a kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000102 IP: [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 PGD 0 Oops: 0000 [#1] SMP Modules linked in: dm_round_robin nvme_rdma(OE) nvme_fabrics(OE) nvme_core(OE) ib_isert iscsi_target_mod target_core_mod ib_ucm mlx4_ib iTCO_wdt iTCO_vendor_support mxm_wmi sb_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm rpcrdma sunrpc irqbypass crc32_pclmul ghash_clmulni_intel rdma_ucm aesni_intel ib_uverbs lrw gf128mul opa_vnic glue_helper ablk_helper ib_iser cryptd ib_umad rdma_cm iw_cm ses enclosure libiscsi scsi_transport_sas pcspkr joydev ib_ipoib(OE) scsi_transport_iscsi ib_cm sg ipmi_ssif mei_me lpc_ich i2c_i801 mei ioatdma ipmi_si dm_multipath ipmi_devintf ipmi_msghandler wmi acpi_pad acpi_power_meter hangcheck_timer ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm hfi1(OE) crct10dif_pclmul crct10dif_common crc32c_intel drm ahci mlx4_core libahci rdmavt(OE) igb megaraid_sas ib_core libata drm_panel_orientation_quirks ptp pps_core devlink dca i2c_algo_bit dm_mirror dm_region_hash dm_log dm_mod CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Phegda X2226A/S2600CW, BIOS SE5C610.86B.01.01.0024.021320181901 02/13/2018 task: ffff8a799ba0d140 ti: ffff8a799bad8000 task.ti: ffff8a799bad8000 RIP: 0010:[<ffffffff94cb7b02>] [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 RSP: 0018:ffff8a90dde43d80 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000086 RCX: 0000000000000000 RDX: ffff8a90b924fcb8 RSI: 0000000000000000 RDI: 000000000000001b RBP: ffff8a90dde43db8 R08: ffff8a799ba0d6d8 R09: ffff8a90dde53900 R10: 0000000000000002 R11: ffff8a90dde43de8 R12: ffff8a90b924fcb8 R13: 000000000000001b R14: 0000000000000000 R15: ffff8a90d2890000 FS: 0000000000000000(0000) GS:ffff8a90dde40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000102 CR3: 0000001a70410000 CR4: 00000000001607e0 Call Trace: [<ffffffff94cb8105>] queue_work_on+0x45/0x50 [<ffffffffc03f781e>] _hfi1_schedule_send+0x6e/0xc0 [hfi1] [<ffffffffc03f78a2>] hfi1_schedule_send+0x32/0x70 [hfi1] [<ffffffffc02cf2d9>] rvt_rc_timeout+0xe9/0x130 [rdmavt] [<ffffffff94ce563a>] ? trigger_load_balance+0x6a/0x280 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94ca7f58>] call_timer_fn+0x38/0x110 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94caa3bd>] run_timer_softirq+0x24d/0x300 [<ffffffff94ca0f05>] __do_softirq+0xf5/0x280 [<ffffffff9537832c>] call_softirq+0x1c/0x30 [<ffffffff94c2e675>] do_softirq+0x65/0xa0 [<ffffffff94ca1285>] irq_exit+0x105/0x110 [<ffffffff953796c8>] smp_apic_timer_interrupt+0x48/0x60 [<ffffffff95375df2>] apic_timer_interrupt+0x162/0x170 <EOI> [<ffffffff951adfb7>] ? cpuidle_enter_state+0x57/0xd0 [<ffffffff951ae10e>] cpuidle_idle_call+0xde/0x230 [<ffffffff94c366de>] arch_cpu_idle+0xe/0xc0 [<ffffffff94cfc3ba>] cpu_startup_entry+0x14a/0x1e0 [<ffffffff94c57db7>] start_secondary+0x1f7/0x270 [<ffffffff94c000d5>] start_cpu+0x5/0x14 The solution is to destroy the workqueue only when the hfi1 driver is unloaded, not when the device is shut down. In addition, when the device is shut down, no more work should be scheduled on the workqueues and the workqueues are flushed. Fixes: 8d3e71136a08 ("IB/{hfi1, qib}: Add handling of kernel restart") Link: https://lore.kernel.org/r/20200623204047.107638.77646.stgit@awfm-01.aw.intel.com Cc: <stable@vger.kernel.org> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-06-24 05:40:47 +09:00
if (ppd->hfi1_wq)
flush_workqueue(ppd->hfi1_wq);
if (ppd->link_wq)
IB/hfi1: Do not destroy hfi1_wq when the device is shut down commit 28b70cd9236563e1a88a6094673fef3c08db0d51 upstream. The workqueue hfi1_wq is destroyed in function shutdown_device(), which is called by either shutdown_one() or remove_one(). The function shutdown_one() is called when the kernel is rebooted while remove_one() is called when the hfi1 driver is unloaded. When the kernel is rebooted, hfi1_wq is destroyed while all qps are still active, leading to a kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000102 IP: [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 PGD 0 Oops: 0000 [#1] SMP Modules linked in: dm_round_robin nvme_rdma(OE) nvme_fabrics(OE) nvme_core(OE) ib_isert iscsi_target_mod target_core_mod ib_ucm mlx4_ib iTCO_wdt iTCO_vendor_support mxm_wmi sb_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm rpcrdma sunrpc irqbypass crc32_pclmul ghash_clmulni_intel rdma_ucm aesni_intel ib_uverbs lrw gf128mul opa_vnic glue_helper ablk_helper ib_iser cryptd ib_umad rdma_cm iw_cm ses enclosure libiscsi scsi_transport_sas pcspkr joydev ib_ipoib(OE) scsi_transport_iscsi ib_cm sg ipmi_ssif mei_me lpc_ich i2c_i801 mei ioatdma ipmi_si dm_multipath ipmi_devintf ipmi_msghandler wmi acpi_pad acpi_power_meter hangcheck_timer ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm hfi1(OE) crct10dif_pclmul crct10dif_common crc32c_intel drm ahci mlx4_core libahci rdmavt(OE) igb megaraid_sas ib_core libata drm_panel_orientation_quirks ptp pps_core devlink dca i2c_algo_bit dm_mirror dm_region_hash dm_log dm_mod CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Phegda X2226A/S2600CW, BIOS SE5C610.86B.01.01.0024.021320181901 02/13/2018 task: ffff8a799ba0d140 ti: ffff8a799bad8000 task.ti: ffff8a799bad8000 RIP: 0010:[<ffffffff94cb7b02>] [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 RSP: 0018:ffff8a90dde43d80 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000086 RCX: 0000000000000000 RDX: ffff8a90b924fcb8 RSI: 0000000000000000 RDI: 000000000000001b RBP: ffff8a90dde43db8 R08: ffff8a799ba0d6d8 R09: ffff8a90dde53900 R10: 0000000000000002 R11: ffff8a90dde43de8 R12: ffff8a90b924fcb8 R13: 000000000000001b R14: 0000000000000000 R15: ffff8a90d2890000 FS: 0000000000000000(0000) GS:ffff8a90dde40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000102 CR3: 0000001a70410000 CR4: 00000000001607e0 Call Trace: [<ffffffff94cb8105>] queue_work_on+0x45/0x50 [<ffffffffc03f781e>] _hfi1_schedule_send+0x6e/0xc0 [hfi1] [<ffffffffc03f78a2>] hfi1_schedule_send+0x32/0x70 [hfi1] [<ffffffffc02cf2d9>] rvt_rc_timeout+0xe9/0x130 [rdmavt] [<ffffffff94ce563a>] ? trigger_load_balance+0x6a/0x280 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94ca7f58>] call_timer_fn+0x38/0x110 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94caa3bd>] run_timer_softirq+0x24d/0x300 [<ffffffff94ca0f05>] __do_softirq+0xf5/0x280 [<ffffffff9537832c>] call_softirq+0x1c/0x30 [<ffffffff94c2e675>] do_softirq+0x65/0xa0 [<ffffffff94ca1285>] irq_exit+0x105/0x110 [<ffffffff953796c8>] smp_apic_timer_interrupt+0x48/0x60 [<ffffffff95375df2>] apic_timer_interrupt+0x162/0x170 <EOI> [<ffffffff951adfb7>] ? cpuidle_enter_state+0x57/0xd0 [<ffffffff951ae10e>] cpuidle_idle_call+0xde/0x230 [<ffffffff94c366de>] arch_cpu_idle+0xe/0xc0 [<ffffffff94cfc3ba>] cpu_startup_entry+0x14a/0x1e0 [<ffffffff94c57db7>] start_secondary+0x1f7/0x270 [<ffffffff94c000d5>] start_cpu+0x5/0x14 The solution is to destroy the workqueue only when the hfi1 driver is unloaded, not when the device is shut down. In addition, when the device is shut down, no more work should be scheduled on the workqueues and the workqueues are flushed. Fixes: 8d3e71136a08 ("IB/{hfi1, qib}: Add handling of kernel restart") Link: https://lore.kernel.org/r/20200623204047.107638.77646.stgit@awfm-01.aw.intel.com Cc: <stable@vger.kernel.org> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-06-24 05:40:47 +09:00
flush_workqueue(ppd->link_wq);
}
sdma_exit(dd);
}
/**
* hfi1_free_ctxtdata - free a context's allocated data
* @dd: the hfi1_ib device
* @rcd: the ctxtdata structure
*
* free up any allocated data for a context
* It should never change any chip state, or global driver state.
*/
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
u32 e;
if (!rcd)
return;
if (rcd->rcvhdrq) {
dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd),
rcd->rcvhdrq, rcd->rcvhdrq_dma);
rcd->rcvhdrq = NULL;
if (rcd->rcvhdrtail_kvaddr) {
dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
(void *)rcd->rcvhdrtail_kvaddr,
rcd->rcvhdrqtailaddr_dma);
rcd->rcvhdrtail_kvaddr = NULL;
}
}
/* all the RcvArray entries should have been cleared by now */
kfree(rcd->egrbufs.rcvtids);
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
rcd->egrbufs.rcvtids = NULL;
for (e = 0; e < rcd->egrbufs.alloced; e++) {
if (rcd->egrbufs.buffers[e].dma)
dma_free_coherent(&dd->pcidev->dev,
rcd->egrbufs.buffers[e].len,
rcd->egrbufs.buffers[e].addr,
rcd->egrbufs.buffers[e].dma);
}
kfree(rcd->egrbufs.buffers);
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
rcd->egrbufs.alloced = 0;
rcd->egrbufs.buffers = NULL;
sc_free(rcd->sc);
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
rcd->sc = NULL;
vfree(rcd->subctxt_uregbase);
vfree(rcd->subctxt_rcvegrbuf);
vfree(rcd->subctxt_rcvhdr_base);
kfree(rcd->opstats);
IB/hfi1: Resolve kernel panics by reference counting receive contexts Base receive contexts can be used by sub contexts. Because of this, resources for the context cannot be completely freed until all sub contexts are done using the base context. Introduce a reference count so that the base receive context can be freed only when all sub contexts are done with it. Use the provided function call for setting default send context integrity rather than the manual method. The cleanup path does not set all variables back to NULL after freeing resources. Since the clean up code can get called more than once, (e.g. during context close and on the error path), it is necessary to make sure that all the variables are NULLed. Possible crash are: BUG: unable to handle kernel paging request at 0000000001908900 IP: read_csr+0x24/0x30 [hfi1] RIP: 0010:read_csr+0x24/0x30 [hfi1] Call Trace: sc_disable+0x40/0x110 [hfi1] hfi1_file_close+0x16f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 or kernel BUG at mm/slub.c:3877! RIP: 0010:kfree+0x14f/0x170 Call Trace: hfi1_free_ctxtdata+0x19a/0x2b0 [hfi1] ? hfi1_user_exp_rcv_grp_free+0x73/0x80 [hfi1] hfi1_file_close+0x20f/0x360 [hfi1] __fput+0xe7/0x210 ____fput+0xe/0x10 Fixes: Commit 62239fc6e554 ("IB/hfi1: Clean up on context initialization failure") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-10 08:00:19 +09:00
rcd->subctxt_uregbase = NULL;
rcd->subctxt_rcvegrbuf = NULL;
rcd->subctxt_rcvhdr_base = NULL;
rcd->opstats = NULL;
}
/*
* Release our hold on the shared asic data. If we are the last one,
* return the structure to be finalized outside the lock. Must be
* holding hfi1_dev_table lock.
*/
static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
{
struct hfi1_asic_data *ad;
int other;
if (!dd->asic_data)
return NULL;
dd->asic_data->dds[dd->hfi1_id] = NULL;
other = dd->hfi1_id ? 0 : 1;
ad = dd->asic_data;
dd->asic_data = NULL;
/* return NULL if the other dd still has a link */
return ad->dds[other] ? NULL : ad;
}
static void finalize_asic_data(struct hfi1_devdata *dd,
struct hfi1_asic_data *ad)
{
clean_up_i2c(dd, ad);
kfree(ad);
}
/**
* hfi1_clean_devdata - cleans up per-unit data structure
* @dd: pointer to a valid devdata structure
*
* It cleans up all data structures set up by
* by hfi1_alloc_devdata().
*/
static void hfi1_clean_devdata(struct hfi1_devdata *dd)
{
struct hfi1_asic_data *ad;
unsigned long flags;
xa_lock_irqsave(&hfi1_dev_table, flags);
__xa_erase(&hfi1_dev_table, dd->unit);
ad = release_asic_data(dd);
xa_unlock_irqrestore(&hfi1_dev_table, flags);
finalize_asic_data(dd, ad);
free_platform_config(dd);
rcu_barrier(); /* wait for rcu callbacks to complete */
free_percpu(dd->int_counter);
free_percpu(dd->rcv_limit);
free_percpu(dd->send_schedule);
free_percpu(dd->tx_opstats);
dd->int_counter = NULL;
dd->rcv_limit = NULL;
dd->send_schedule = NULL;
dd->tx_opstats = NULL;
IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support Currently the driver doesn't support completion vectors. These are used to indicate which sets of CQs should be grouped together into the same vector. A vector is a CQ processing thread that runs on a specific CPU. If an application has several CQs bound to different completion vectors, and each completion vector runs on different CPUs, then the completion queue workload is balanced. This helps scale as more nodes are used. Implement CQ completion vector support using a global workqueue where a CQ entry is queued to the CPU corresponding to the CQ's completion vector. Since the workqueue is global, it's guaranteed to always be there when queueing CQ entries; Therefore, the RCU locking for cq->rdi->worker in the hot path is superfluous. Each completion vector is assigned to a different CPU. The number of completion vectors available is computed by taking the number of online, physical CPUs from the local NUMA node and subtracting the CPUs used for kernel receive queues and the general interrupt. Special use cases: * If there are no CPUs left for completion vectors, the same CPU for the general interrupt is used; Therefore, there would only be one completion vector available. * For multi-HFI systems, the number of completion vectors available for each device is the total number of completion vectors in the local NUMA node divided by the number of devices in the same NUMA node. If there's a division remainder, the first device to get initialized gets an extra completion vector. Upon a CQ creation, an invalid completion vector could be specified. Handle it as follows: * If the completion vector is less than 0, set it to 0. * Set the completion vector to the result of the passed completion vector moded with the number of device completion vectors available. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 22:43:55 +09:00
kfree(dd->comp_vect);
dd->comp_vect = NULL;
IB/hfi1: Fix for early release of sdma context With IRQF_SHARED flag set and CONFIG_DEBUG_SHIRQ enabled module removal may result in panic in sdma_interrupt() routine if associated sdma context was released before pci_free_irq(); [ 9198.939885] BUG: unable to handle kernel NULL pointer dereference at (null) [ 9198.940514] IP: sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.941114] PGD 170bdc0067 P4D 170bdc0067 PUD 172063e067 PMD 0 [ 9198.941783] Oops: 0000 [#1] SMP ..... [ 9198.958877] CPU: 132 PID: 64173 Comm: rmmod Tainted: G OE 4.14.0-rc4+ #1 [ 9198.961032] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS S72C610.86B.01.02.0118.080620171935 08/06/2017 [ 9198.963323] task: ffff9681397f0000 task.stack: ffffae1647c40000 [ 9198.965695] RIP: 0010:sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.968082] RSP: 0018:ffffae1647c43be8 EFLAGS: 00010046 [ 9198.970503] RAX: 0000000000000000 RBX: ffff9680ce8b5ca8 RCX: 0000000000000000 [ 9198.973006] RDX: 0000000000000000 RSI: 0000000001a00d28 RDI: ffff9680ce8b5ca0 [ 9198.975546] RBP: ffffae1647c43c40 R08: ffff96814325ec00 R09: 00000000ffffffff [ 9198.978142] R10: 000000004325e501 R11: ffff96814325ec00 R12: ffff9680ce8b5c44 [ 9198.980779] R13: ffff9680ce8b5ca0 R14: 0000000000000000 R15: ffff9680ce8b5b00 [ 9198.983462] FS: 00007f31196ba740(0000) GS:ffff96819df00000(0000) knlGS:0000000000000000 [ 9198.986231] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9198.989036] CR2: 0000000000000000 CR3: 000000170833f000 CR4: 00000000001406e0 [ 9198.991911] Call Trace: [ 9198.994847] sdma_engine_interrupt+0x82/0x100 [hfi1] [ 9198.997852] sdma_interrupt+0x61/0xc0 [hfi1] [ 9199.000852] __free_irq+0x1b3/0x2d0 [ 9199.003873] free_irq+0x35/0x70 [ 9199.006909] pci_free_irq+0x1c/0x30 [ 9199.009999] clean_up_interrupts+0x53/0xf0 [hfi1] [ 9199.013137] hfi1_start_cleanup+0x117/0x190 [hfi1] [ 9199.016315] postinit_cleanup+0x1d/0x270 [hfi1] [ 9199.019529] remove_one+0x1f3/0x210 [hfi1] [ 9199.022738] pci_device_remove+0x39/0xc0 [ 9199.025974] device_release_driver_internal+0x141/0x210 [ 9199.029268] driver_detach+0x3f/0x80 [ 9199.032580] bus_remove_driver+0x55/0xd0 [ 9199.035931] driver_unregister+0x2c/0x50 [ 9199.039321] pci_unregister_driver+0x2a/0xa0 [ 9199.042755] hfi1_mod_cleanup+0x10/0xb50 [hfi1] [ 9199.046196] SyS_delete_module+0x171/0x250 ... Fix by exporting sdma_clean() and removing from sdma_exit(). sdma_exit() now just manipulates the engine state, leaving the memory free to sdma_clean() which is now called just before the dd is freed. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Michael J Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-02-02 03:43:50 +09:00
sdma_clean(dd, dd->num_sdma);
rvt_dealloc_device(&dd->verbs_dev.rdi);
}
static void __hfi1_free_devdata(struct kobject *kobj)
{
struct hfi1_devdata *dd =
container_of(kobj, struct hfi1_devdata, kobj);
hfi1_clean_devdata(dd);
}
static struct kobj_type hfi1_devdata_type = {
.release = __hfi1_free_devdata,
};
void hfi1_free_devdata(struct hfi1_devdata *dd)
{
kobject_put(&dd->kobj);
}
/**
* hfi1_alloc_devdata - Allocate our primary per-unit data structure.
* @pdev: Valid PCI device
* @extra: How many bytes to alloc past the default
*
* Must be done via verbs allocator, because the verbs cleanup process
* both does cleanup and free of the data structure.
* "extra" is for chip-specific data.
*/
static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
size_t extra)
{
struct hfi1_devdata *dd;
int ret, nports;
/* extra is * number of ports */
nports = extra / sizeof(struct hfi1_pportdata);
dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
nports);
if (!dd)
return ERR_PTR(-ENOMEM);
dd->num_pports = nports;
dd->pport = (struct hfi1_pportdata *)(dd + 1);
2018-05-01 21:35:58 +09:00
dd->pcidev = pdev;
pci_set_drvdata(pdev, dd);
mm: replace all open encodings for NUMA_NO_NODE Patch series "Replace all open encodings for NUMA_NO_NODE", v3. All these places for replacement were found by running the following grep patterns on the entire kernel code. Please let me know if this might have missed some instances. This might also have replaced some false positives. I will appreciate suggestions, inputs and review. 1. git grep "nid == -1" 2. git grep "node == -1" 3. git grep "nid = -1" 4. git grep "node = -1" This patch (of 2): At present there are multiple places where invalid node number is encoded as -1. Even though implicitly understood it is always better to have macros in there. Replace these open encodings for an invalid node number with the global macro NUMA_NO_NODE. This helps remove NUMA related assumptions like 'invalid node' from various places redirecting them to a common definition. Link: http://lkml.kernel.org/r/1545127933-10711-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com> Reviewed-by: David Hildenbrand <david@redhat.com> Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> [ixgbe] Acked-by: Jens Axboe <axboe@kernel.dk> [mtip32xx] Acked-by: Vinod Koul <vkoul@kernel.org> [dmaengine.c] Acked-by: Michael Ellerman <mpe@ellerman.id.au> [powerpc] Acked-by: Doug Ledford <dledford@redhat.com> [drivers/infiniband] Cc: Joseph Qi <jiangqi903@gmail.com> Cc: Hans Verkuil <hverkuil@xs4all.nl> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-06 08:42:58 +09:00
dd->node = NUMA_NO_NODE;
ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b,
GFP_KERNEL);
if (ret < 0) {
dev_err(&pdev->dev,
"Could not allocate unit ID: error %d\n", -ret);
goto bail;
}
rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);
/*
* Initialize all locks for the device. This needs to be as early as
* possible so locks are usable.
*/
spin_lock_init(&dd->sc_lock);
spin_lock_init(&dd->sendctrl_lock);
spin_lock_init(&dd->rcvctrl_lock);
spin_lock_init(&dd->uctxt_lock);
spin_lock_init(&dd->hfi1_diag_trans_lock);
spin_lock_init(&dd->sc_init_lock);
spin_lock_init(&dd->dc8051_memlock);
seqlock_init(&dd->sc2vl_lock);
spin_lock_init(&dd->sde_map_lock);
spin_lock_init(&dd->pio_map_lock);
IB/hfi1: Fix softlockup issue Soft lockups can occur because the mad processing on different CPUs acquire the spin lock dc8051_lock: [534552.835870] [<ffffffffa026f993>] ? read_dev_port_cntr.isra.37+0x23/0x160 [hfi1] [534552.835880] [<ffffffffa02775af>] read_dev_cntr+0x4f/0x60 [hfi1] [534552.835893] [<ffffffffa028d7cd>] pma_get_opa_portstatus+0x64d/0x8c0 [hfi1] [534552.835904] [<ffffffffa0290e7d>] hfi1_process_mad+0x48d/0x18c0 [hfi1] [534552.835908] [<ffffffff811dc1f1>] ? __slab_free+0x81/0x2f0 [534552.835936] [<ffffffffa024c34e>] ? ib_mad_recv_done+0x21e/0xa30 [ib_core] [534552.835939] [<ffffffff811dd153>] ? __kmalloc+0x1f3/0x240 [534552.835947] [<ffffffffa024c3fb>] ib_mad_recv_done+0x2cb/0xa30 [ib_core] [534552.835955] [<ffffffffa0237c85>] __ib_process_cq+0x55/0xd0 [ib_core] [534552.835962] [<ffffffffa0237d70>] ib_cq_poll_work+0x20/0x60 [ib_core] [534552.835964] [<ffffffff810a7f3b>] process_one_work+0x17b/0x470 [534552.835966] [<ffffffff810a8d76>] worker_thread+0x126/0x410 [534552.835969] [<ffffffff810a8c50>] ? rescuer_thread+0x460/0x460 [534552.835971] [<ffffffff810b052f>] kthread+0xcf/0xe0 [534552.835974] [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140 [534552.835977] [<ffffffff81696418>] ret_from_fork+0x58/0x90 [534552.835980] [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140 This issue is made worse when the 8051 is busy and the reads take longer. Fix by using a non-spinning lock procure. Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Reviewed-by: Mike Marciszyn <mike.marciniszyn@intel.com> Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-04-29 02:40:02 +09:00
mutex_init(&dd->dc8051_lock);
init_waitqueue_head(&dd->event_queue);
spin_lock_init(&dd->irq_src_lock);
dd->int_counter = alloc_percpu(u64);
if (!dd->int_counter) {
ret = -ENOMEM;
goto bail;
}
dd->rcv_limit = alloc_percpu(u64);
if (!dd->rcv_limit) {
ret = -ENOMEM;
goto bail;
}
dd->send_schedule = alloc_percpu(u64);
if (!dd->send_schedule) {
ret = -ENOMEM;
goto bail;
}
dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx);
if (!dd->tx_opstats) {
ret = -ENOMEM;
goto bail;
}
IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support Currently the driver doesn't support completion vectors. These are used to indicate which sets of CQs should be grouped together into the same vector. A vector is a CQ processing thread that runs on a specific CPU. If an application has several CQs bound to different completion vectors, and each completion vector runs on different CPUs, then the completion queue workload is balanced. This helps scale as more nodes are used. Implement CQ completion vector support using a global workqueue where a CQ entry is queued to the CPU corresponding to the CQ's completion vector. Since the workqueue is global, it's guaranteed to always be there when queueing CQ entries; Therefore, the RCU locking for cq->rdi->worker in the hot path is superfluous. Each completion vector is assigned to a different CPU. The number of completion vectors available is computed by taking the number of online, physical CPUs from the local NUMA node and subtracting the CPUs used for kernel receive queues and the general interrupt. Special use cases: * If there are no CPUs left for completion vectors, the same CPU for the general interrupt is used; Therefore, there would only be one completion vector available. * For multi-HFI systems, the number of completion vectors available for each device is the total number of completion vectors in the local NUMA node divided by the number of devices in the same NUMA node. If there's a division remainder, the first device to get initialized gets an extra completion vector. Upon a CQ creation, an invalid completion vector could be specified. Handle it as follows: * If the completion vector is less than 0, set it to 0. * Set the completion vector to the result of the passed completion vector moded with the number of device completion vectors available. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 22:43:55 +09:00
dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL);
if (!dd->comp_vect) {
ret = -ENOMEM;
goto bail;
}
kobject_init(&dd->kobj, &hfi1_devdata_type);
return dd;
bail:
hfi1_clean_devdata(dd);
return ERR_PTR(ret);
}
/*
* Called from freeze mode handlers, and from PCI error
* reporting code. Should be paranoid about state of
* system and data structures.
*/
void hfi1_disable_after_error(struct hfi1_devdata *dd)
{
if (dd->flags & HFI1_INITTED) {
u32 pidx;
dd->flags &= ~HFI1_INITTED;
if (dd->pport)
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
struct hfi1_pportdata *ppd;
ppd = dd->pport + pidx;
if (dd->flags & HFI1_PRESENT)
set_link_state(ppd, HLS_DN_DISABLE);
if (ppd->statusp)
*ppd->statusp &= ~HFI1_STATUS_IB_READY;
}
}
/*
* Mark as having had an error for driver, and also
* for /sys and status word mapped to user programs.
* This marks unit as not usable, until reset.
*/
if (dd->status)
dd->status->dev |= HFI1_STATUS_HWERROR;
}
static void remove_one(struct pci_dev *);
static int init_one(struct pci_dev *, const struct pci_device_id *);
static void shutdown_one(struct pci_dev *);
#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
#define PFX DRIVER_NAME ": "
const struct pci_device_id hfi1_pci_tbl[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
static struct pci_driver hfi1_pci_driver = {
.name = DRIVER_NAME,
.probe = init_one,
.remove = remove_one,
.shutdown = shutdown_one,
.id_table = hfi1_pci_tbl,
.err_handler = &hfi1_pci_err_handler,
};
static void __init compute_krcvqs(void)
{
int i;
for (i = 0; i < krcvqsset; i++)
n_krcvqs += krcvqs[i];
}
/*
* Do all the generic driver unit- and chip-independent memory
* allocation and initialization.
*/
static int __init hfi1_mod_init(void)
{
int ret;
ret = dev_init();
if (ret)
goto bail;
ret = node_affinity_init();
if (ret)
goto bail;
/* validate max MTU before any devices start */
if (!valid_opa_max_mtu(hfi1_max_mtu)) {
pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
}
/* valid CUs run from 1-128 in powers of 2 */
if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
hfi1_cu = 1;
/* valid credit return threshold is 0-100, variable is unsigned */
if (user_credit_return_threshold > 100)
user_credit_return_threshold = 100;
compute_krcvqs();
/*
* sanitize receive interrupt count, time must wait until after
* the hardware type is known
*/
if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
/* reject invalid combinations */
if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
rcv_intr_count = 1;
}
if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
/*
* Avoid indefinite packet delivery by requiring a timeout
* if count is > 1.
*/
pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
rcv_intr_timeout = 1;
}
if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
/*
* The dynamic algorithm expects a non-zero timeout
* and a count > 1.
*/
pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
rcv_intr_dynamic = 0;
}
/* sanitize link CRC options */
link_crc_mask &= SUPPORTED_CRCS;
ret = opfn_init();
if (ret < 0) {
pr_err("Failed to allocate opfn_wq");
goto bail_dev;
}
/*
* These must be called before the driver is registered with
* the PCI subsystem.
*/
hfi1_dbg_init();
ret = pci_register_driver(&hfi1_pci_driver);
if (ret < 0) {
pr_err("Unable to register driver: error %d\n", -ret);
goto bail_dev;
}
goto bail; /* all OK */
bail_dev:
hfi1_dbg_exit();
dev_cleanup();
bail:
return ret;
}
module_init(hfi1_mod_init);
/*
* Do the non-unit driver cleanup, memory free, etc. at unload.
*/
static void __exit hfi1_mod_cleanup(void)
{
pci_unregister_driver(&hfi1_pci_driver);
opfn_exit();
IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support Currently the driver doesn't support completion vectors. These are used to indicate which sets of CQs should be grouped together into the same vector. A vector is a CQ processing thread that runs on a specific CPU. If an application has several CQs bound to different completion vectors, and each completion vector runs on different CPUs, then the completion queue workload is balanced. This helps scale as more nodes are used. Implement CQ completion vector support using a global workqueue where a CQ entry is queued to the CPU corresponding to the CQ's completion vector. Since the workqueue is global, it's guaranteed to always be there when queueing CQ entries; Therefore, the RCU locking for cq->rdi->worker in the hot path is superfluous. Each completion vector is assigned to a different CPU. The number of completion vectors available is computed by taking the number of online, physical CPUs from the local NUMA node and subtracting the CPUs used for kernel receive queues and the general interrupt. Special use cases: * If there are no CPUs left for completion vectors, the same CPU for the general interrupt is used; Therefore, there would only be one completion vector available. * For multi-HFI systems, the number of completion vectors available for each device is the total number of completion vectors in the local NUMA node divided by the number of devices in the same NUMA node. If there's a division remainder, the first device to get initialized gets an extra completion vector. Upon a CQ creation, an invalid completion vector could be specified. Handle it as follows: * If the completion vector is less than 0, set it to 0. * Set the completion vector to the result of the passed completion vector moded with the number of device completion vectors available. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 22:43:55 +09:00
node_affinity_destroy_all();
hfi1_dbg_exit();
WARN_ON(!xa_empty(&hfi1_dev_table));
dispose_firmware(); /* asymmetric with obtain_firmware() */
dev_cleanup();
}
module_exit(hfi1_mod_cleanup);
/* this can only be called after a successful initialization */
static void cleanup_device_data(struct hfi1_devdata *dd)
{
int ctxt;
int pidx;
/* users can't do anything more with chip */
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
struct hfi1_pportdata *ppd = &dd->pport[pidx];
struct cc_state *cc_state;
int i;
if (ppd->statusp)
*ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
for (i = 0; i < OPA_MAX_SLS; i++)
hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
spin_lock(&ppd->cc_state_lock);
cc_state = get_cc_state_protected(ppd);
RCU_INIT_POINTER(ppd->cc_state, NULL);
spin_unlock(&ppd->cc_state_lock);
if (cc_state)
kfree_rcu(cc_state, rcu);
}
free_credit_return(dd);
if (dd->rcvhdrtail_dummy_kvaddr) {
dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
(void *)dd->rcvhdrtail_dummy_kvaddr,
dd->rcvhdrtail_dummy_dma);
dd->rcvhdrtail_dummy_kvaddr = NULL;
}
/*
* Free any resources still in use (usually just kernel contexts)
* at unload; we do for ctxtcnt, because that's what we allocate.
*/
for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) {
struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
if (rcd) {
hfi1_free_ctxt_rcv_groups(rcd);
hfi1_free_ctxt(rcd);
}
}
kfree(dd->rcd);
dd->rcd = NULL;
free_pio_map(dd);
/* must follow rcv context free - need to remove rcv's hooks */
for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
sc_free(dd->send_contexts[ctxt].sc);
dd->num_send_contexts = 0;
kfree(dd->send_contexts);
dd->send_contexts = NULL;
kfree(dd->hw_to_sw);
dd->hw_to_sw = NULL;
kfree(dd->boardname);
vfree(dd->events);
vfree(dd->status);
}
/*
* Clean up on unit shutdown, or error during unit load after
* successful initialization.
*/
static void postinit_cleanup(struct hfi1_devdata *dd)
{
hfi1_start_cleanup(dd);
IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support Currently the driver doesn't support completion vectors. These are used to indicate which sets of CQs should be grouped together into the same vector. A vector is a CQ processing thread that runs on a specific CPU. If an application has several CQs bound to different completion vectors, and each completion vector runs on different CPUs, then the completion queue workload is balanced. This helps scale as more nodes are used. Implement CQ completion vector support using a global workqueue where a CQ entry is queued to the CPU corresponding to the CQ's completion vector. Since the workqueue is global, it's guaranteed to always be there when queueing CQ entries; Therefore, the RCU locking for cq->rdi->worker in the hot path is superfluous. Each completion vector is assigned to a different CPU. The number of completion vectors available is computed by taking the number of online, physical CPUs from the local NUMA node and subtracting the CPUs used for kernel receive queues and the general interrupt. Special use cases: * If there are no CPUs left for completion vectors, the same CPU for the general interrupt is used; Therefore, there would only be one completion vector available. * For multi-HFI systems, the number of completion vectors available for each device is the total number of completion vectors in the local NUMA node divided by the number of devices in the same NUMA node. If there's a division remainder, the first device to get initialized gets an extra completion vector. Upon a CQ creation, an invalid completion vector could be specified. Handle it as follows: * If the completion vector is less than 0, set it to 0. * Set the completion vector to the result of the passed completion vector moded with the number of device completion vectors available. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 22:43:55 +09:00
hfi1_comp_vectors_clean_up(dd);
hfi1_dev_affinity_clean_up(dd);
hfi1_pcie_ddcleanup(dd);
hfi1_pcie_cleanup(dd->pcidev);
cleanup_device_data(dd);
hfi1_free_devdata(dd);
}
static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt)
{
if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
dd_dev_err(dd, "Receive header queue count too small\n");
return -EINVAL;
}
if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
dd_dev_err(dd,
"Receive header queue count cannot be greater than %u\n",
HFI1_MAX_HDRQ_EGRBUF_CNT);
return -EINVAL;
}
if (thecnt % HDRQ_INCREMENT) {
dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n",
thecnt, HDRQ_INCREMENT);
return -EINVAL;
}
return 0;
}
static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
int ret = 0, j, pidx, initfail;
struct hfi1_devdata *dd;
struct hfi1_pportdata *ppd;
/* First, lock the non-writable module parameters */
HFI1_CAP_LOCK();
/* Validate dev ids */
if (!(ent->device == PCI_DEVICE_ID_INTEL0 ||
ent->device == PCI_DEVICE_ID_INTEL1)) {
dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
ent->device);
ret = -ENODEV;
goto bail;
}
/* Allocate the dd so we can get to work */
dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
sizeof(struct hfi1_pportdata));
if (IS_ERR(dd)) {
ret = PTR_ERR(dd);
goto bail;
}
/* Validate some global module parameters */
ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt);
if (ret)
goto bail;
/* use the encoding function as a sanitization check */
if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
hfi1_hdrq_entsize);
ret = -EINVAL;
goto bail;
}
/* The receive eager buffer size must be set before the receive
* contexts are created.
*
* Set the eager buffer size. Validate that it falls in a range
* allowed by the hardware - all powers of 2 between the min and
* max. The maximum valid MTU is within the eager buffer range
* so we do not need to cap the max_mtu by an eager buffer size
* setting.
*/
if (eager_buffer_size) {
if (!is_power_of_2(eager_buffer_size))
eager_buffer_size =
roundup_pow_of_two(eager_buffer_size);
eager_buffer_size =
clamp_val(eager_buffer_size,
MIN_EAGER_BUFFER * 8,
MAX_EAGER_BUFFER_TOTAL);
dd_dev_info(dd, "Eager buffer size %u\n",
eager_buffer_size);
} else {
dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
ret = -EINVAL;
goto bail;
}
/* restrict value of hfi1_rcvarr_split */
hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
ret = hfi1_pcie_init(dd);
if (ret)
goto bail;
/*
* Do device-specific initialization, function table setup, dd
* allocation, etc.
*/
ret = hfi1_init_dd(dd);
if (ret)
goto clean_bail; /* error already printed */
ret = create_workqueues(dd);
if (ret)
goto clean_bail;
/* do the generic initialization */
initfail = hfi1_init(dd, 0);
/* setup vnic */
hfi1_vnic_setup(dd);
ret = hfi1_register_ib_device(dd);
/*
* Now ready for use. this should be cleared whenever we
* detect a reset, or initiate one. If earlier failure,
* we still create devices, so diags, etc. can be used
* to determine cause of problem.
*/
if (!initfail && !ret) {
dd->flags |= HFI1_INITTED;
/* create debufs files after init and ib register */
hfi1_dbg_ibdev_init(&dd->verbs_dev);
}
j = hfi1_device_create(dd);
if (j)
dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
if (initfail || ret) {
msix_clean_up_interrupts(dd);
stop_timers(dd);
flush_workqueue(ib_wq);
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
hfi1_quiet_serdes(dd->pport + pidx);
ppd = dd->pport + pidx;
if (ppd->hfi1_wq) {
destroy_workqueue(ppd->hfi1_wq);
ppd->hfi1_wq = NULL;
}
if (ppd->link_wq) {
destroy_workqueue(ppd->link_wq);
ppd->link_wq = NULL;
}
}
if (!j)
hfi1_device_remove(dd);
if (!ret)
hfi1_unregister_ib_device(dd);
hfi1_vnic_cleanup(dd);
postinit_cleanup(dd);
if (initfail)
ret = initfail;
goto bail; /* everything already cleaned */
}
sdma_start(dd);
return 0;
clean_bail:
hfi1_pcie_cleanup(pdev);
bail:
return ret;
}
static void wait_for_clients(struct hfi1_devdata *dd)
{
/*
* Remove the device init value and complete the device if there is
* no clients or wait for active clients to finish.
*/
if (atomic_dec_and_test(&dd->user_refcount))
complete(&dd->user_comp);
wait_for_completion(&dd->user_comp);
}
static void remove_one(struct pci_dev *pdev)
{
struct hfi1_devdata *dd = pci_get_drvdata(pdev);
/* close debugfs files before ib unregister */
hfi1_dbg_ibdev_exit(&dd->verbs_dev);
/* remove the /dev hfi1 interface */
hfi1_device_remove(dd);
/* wait for existing user space clients to finish */
wait_for_clients(dd);
/* unregister from IB core */
hfi1_unregister_ib_device(dd);
/* cleanup vnic */
hfi1_vnic_cleanup(dd);
/*
* Disable the IB link, disable interrupts on the device,
* clear dma engines, etc.
*/
shutdown_device(dd);
IB/hfi1: Do not destroy hfi1_wq when the device is shut down commit 28b70cd9236563e1a88a6094673fef3c08db0d51 upstream. The workqueue hfi1_wq is destroyed in function shutdown_device(), which is called by either shutdown_one() or remove_one(). The function shutdown_one() is called when the kernel is rebooted while remove_one() is called when the hfi1 driver is unloaded. When the kernel is rebooted, hfi1_wq is destroyed while all qps are still active, leading to a kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000102 IP: [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 PGD 0 Oops: 0000 [#1] SMP Modules linked in: dm_round_robin nvme_rdma(OE) nvme_fabrics(OE) nvme_core(OE) ib_isert iscsi_target_mod target_core_mod ib_ucm mlx4_ib iTCO_wdt iTCO_vendor_support mxm_wmi sb_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm rpcrdma sunrpc irqbypass crc32_pclmul ghash_clmulni_intel rdma_ucm aesni_intel ib_uverbs lrw gf128mul opa_vnic glue_helper ablk_helper ib_iser cryptd ib_umad rdma_cm iw_cm ses enclosure libiscsi scsi_transport_sas pcspkr joydev ib_ipoib(OE) scsi_transport_iscsi ib_cm sg ipmi_ssif mei_me lpc_ich i2c_i801 mei ioatdma ipmi_si dm_multipath ipmi_devintf ipmi_msghandler wmi acpi_pad acpi_power_meter hangcheck_timer ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm hfi1(OE) crct10dif_pclmul crct10dif_common crc32c_intel drm ahci mlx4_core libahci rdmavt(OE) igb megaraid_sas ib_core libata drm_panel_orientation_quirks ptp pps_core devlink dca i2c_algo_bit dm_mirror dm_region_hash dm_log dm_mod CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Phegda X2226A/S2600CW, BIOS SE5C610.86B.01.01.0024.021320181901 02/13/2018 task: ffff8a799ba0d140 ti: ffff8a799bad8000 task.ti: ffff8a799bad8000 RIP: 0010:[<ffffffff94cb7b02>] [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0 RSP: 0018:ffff8a90dde43d80 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000086 RCX: 0000000000000000 RDX: ffff8a90b924fcb8 RSI: 0000000000000000 RDI: 000000000000001b RBP: ffff8a90dde43db8 R08: ffff8a799ba0d6d8 R09: ffff8a90dde53900 R10: 0000000000000002 R11: ffff8a90dde43de8 R12: ffff8a90b924fcb8 R13: 000000000000001b R14: 0000000000000000 R15: ffff8a90d2890000 FS: 0000000000000000(0000) GS:ffff8a90dde40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000102 CR3: 0000001a70410000 CR4: 00000000001607e0 Call Trace: [<ffffffff94cb8105>] queue_work_on+0x45/0x50 [<ffffffffc03f781e>] _hfi1_schedule_send+0x6e/0xc0 [hfi1] [<ffffffffc03f78a2>] hfi1_schedule_send+0x32/0x70 [hfi1] [<ffffffffc02cf2d9>] rvt_rc_timeout+0xe9/0x130 [rdmavt] [<ffffffff94ce563a>] ? trigger_load_balance+0x6a/0x280 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94ca7f58>] call_timer_fn+0x38/0x110 [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt] [<ffffffff94caa3bd>] run_timer_softirq+0x24d/0x300 [<ffffffff94ca0f05>] __do_softirq+0xf5/0x280 [<ffffffff9537832c>] call_softirq+0x1c/0x30 [<ffffffff94c2e675>] do_softirq+0x65/0xa0 [<ffffffff94ca1285>] irq_exit+0x105/0x110 [<ffffffff953796c8>] smp_apic_timer_interrupt+0x48/0x60 [<ffffffff95375df2>] apic_timer_interrupt+0x162/0x170 <EOI> [<ffffffff951adfb7>] ? cpuidle_enter_state+0x57/0xd0 [<ffffffff951ae10e>] cpuidle_idle_call+0xde/0x230 [<ffffffff94c366de>] arch_cpu_idle+0xe/0xc0 [<ffffffff94cfc3ba>] cpu_startup_entry+0x14a/0x1e0 [<ffffffff94c57db7>] start_secondary+0x1f7/0x270 [<ffffffff94c000d5>] start_cpu+0x5/0x14 The solution is to destroy the workqueue only when the hfi1 driver is unloaded, not when the device is shut down. In addition, when the device is shut down, no more work should be scheduled on the workqueues and the workqueues are flushed. Fixes: 8d3e71136a08 ("IB/{hfi1, qib}: Add handling of kernel restart") Link: https://lore.kernel.org/r/20200623204047.107638.77646.stgit@awfm-01.aw.intel.com Cc: <stable@vger.kernel.org> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-06-24 05:40:47 +09:00
destroy_workqueues(dd);
stop_timers(dd);
/* wait until all of our (qsfp) queue_work() calls complete */
flush_workqueue(ib_wq);
postinit_cleanup(dd);
}
static void shutdown_one(struct pci_dev *pdev)
{
struct hfi1_devdata *dd = pci_get_drvdata(pdev);
shutdown_device(dd);
}
/**
* hfi1_create_rcvhdrq - create a receive header queue
* @dd: the hfi1_ib device
* @rcd: the context data
*
* This must be contiguous memory (from an i/o perspective), and must be
* DMA'able (which means for some systems, it will go through an IOMMU,
* or be forced into a low address range).
*/
int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
unsigned amt;
u64 reg;
if (!rcd->rcvhdrq) {
gfp_t gfp_flags;
amt = rcvhdrq_size(rcd);
if (rcd->ctxt < dd->first_dyn_alloc_ctxt || rcd->is_vnic)
gfp_flags = GFP_KERNEL;
else
gfp_flags = GFP_USER;
rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
&rcd->rcvhdrq_dma,
gfp_flags | __GFP_COMP);
if (!rcd->rcvhdrq) {
dd_dev_err(dd,
"attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
amt, rcd->ctxt);
goto bail;
}
if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
PAGE_SIZE,
&rcd->rcvhdrqtailaddr_dma,
gfp_flags);
if (!rcd->rcvhdrtail_kvaddr)
goto bail_free;
}
}
/*
* These values are per-context:
* RcvHdrCnt
* RcvHdrEntSize
* RcvHdrSize
*/
reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
& RCV_HDR_CNT_CNT_MASK)
<< RCV_HDR_CNT_CNT_SHIFT;
write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
& RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
<< RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
reg = ((u64)DEFAULT_RCVHDRSIZE & RCV_HDR_SIZE_HDR_SIZE_MASK)
<< RCV_HDR_SIZE_HDR_SIZE_SHIFT;
write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
/*
* Program dummy tail address for every receive context
* before enabling any receive context
*/
write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
dd->rcvhdrtail_dummy_dma);
return 0;
bail_free:
dd_dev_err(dd,
"attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
rcd->ctxt);
dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
rcd->rcvhdrq_dma);
rcd->rcvhdrq = NULL;
bail:
return -ENOMEM;
}
/**
* allocate eager buffers, both kernel and user contexts.
* @rcd: the context we are setting up.
*
* Allocate the eager TID buffers and program them into hip.
* They are no longer completely contiguous, we do multiple allocation
* calls. Otherwise we get the OOM code involved, by asking for too
* much per call, with disastrous results on some kernels.
*/
int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
{
struct hfi1_devdata *dd = rcd->dd;
u32 max_entries, egrtop, alloced_bytes = 0;
gfp_t gfp_flags;
u16 order, idx = 0;
int ret = 0;
u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
/*
* GFP_USER, but without GFP_FS, so buffer cache can be
* coalesced (we hope); otherwise, even at order 4,
* heavy filesystem activity makes these fail, and we can
* use compound pages.
*/
gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
/*
* The minimum size of the eager buffers is a groups of MTU-sized
* buffers.
* The global eager_buffer_size parameter is checked against the
* theoretical lower limit of the value. Here, we check against the
* MTU.
*/
if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
/*
* If using one-pkt-per-egr-buffer, lower the eager buffer
* size to the max MTU (page-aligned).
*/
if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
rcd->egrbufs.rcvtid_size = round_mtu;
/*
* Eager buffers sizes of 1MB or less require smaller TID sizes
* to satisfy the "multiple of 8 RcvArray entries" requirement.
*/
if (rcd->egrbufs.size <= (1 << 20))
rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
rounddown_pow_of_two(rcd->egrbufs.size / 8));
while (alloced_bytes < rcd->egrbufs.size &&
rcd->egrbufs.alloced < rcd->egrbufs.count) {
rcd->egrbufs.buffers[idx].addr =
dma_alloc_coherent(&dd->pcidev->dev,
rcd->egrbufs.rcvtid_size,
&rcd->egrbufs.buffers[idx].dma,
gfp_flags);
if (rcd->egrbufs.buffers[idx].addr) {
rcd->egrbufs.buffers[idx].len =
rcd->egrbufs.rcvtid_size;
rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
rcd->egrbufs.buffers[idx].addr;
rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma =
rcd->egrbufs.buffers[idx].dma;
rcd->egrbufs.alloced++;
alloced_bytes += rcd->egrbufs.rcvtid_size;
idx++;
} else {
u32 new_size, i, j;
u64 offset = 0;
/*
* Fail the eager buffer allocation if:
* - we are already using the lowest acceptable size
* - we are using one-pkt-per-egr-buffer (this implies
* that we are accepting only one size)
*/
if (rcd->egrbufs.rcvtid_size == round_mtu ||
!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
rcd->ctxt);
ret = -ENOMEM;
goto bail_rcvegrbuf_phys;
}
new_size = rcd->egrbufs.rcvtid_size / 2;
/*
* If the first attempt to allocate memory failed, don't
* fail everything but continue with the next lower
* size.
*/
if (idx == 0) {
rcd->egrbufs.rcvtid_size = new_size;
continue;
}
/*
* Re-partition already allocated buffers to a smaller
* size.
*/
rcd->egrbufs.alloced = 0;
for (i = 0, j = 0, offset = 0; j < idx; i++) {
if (i >= rcd->egrbufs.count)
break;
rcd->egrbufs.rcvtids[i].dma =
rcd->egrbufs.buffers[j].dma + offset;
rcd->egrbufs.rcvtids[i].addr =
rcd->egrbufs.buffers[j].addr + offset;
rcd->egrbufs.alloced++;
if ((rcd->egrbufs.buffers[j].dma + offset +
new_size) ==
(rcd->egrbufs.buffers[j].dma +
rcd->egrbufs.buffers[j].len)) {
j++;
offset = 0;
} else {
offset += new_size;
}
}
rcd->egrbufs.rcvtid_size = new_size;
}
}
rcd->egrbufs.numbufs = idx;
rcd->egrbufs.size = alloced_bytes;
hfi1_cdbg(PROC,
"ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB\n",
rcd->ctxt, rcd->egrbufs.alloced,
rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
/*
* Set the contexts rcv array head update threshold to the closest
* power of 2 (so we can use a mask instead of modulo) below half
* the allocated entries.
*/
rcd->egrbufs.threshold =
rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
/*
* Compute the expected RcvArray entry base. This is done after
* allocating the eager buffers in order to maximize the
* expected RcvArray entries for the context.
*/
max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
rcd->expected_count = max_entries - egrtop;
if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
rcd->expected_base = rcd->eager_base + egrtop;
hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
rcd->eager_base, rcd->expected_base);
if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
hfi1_cdbg(PROC,
"ctxt%u: current Eager buffer size is invalid %u\n",
rcd->ctxt, rcd->egrbufs.rcvtid_size);
ret = -EINVAL;
goto bail_rcvegrbuf_phys;
}
for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
rcd->egrbufs.rcvtids[idx].dma, order);
cond_resched();
}
return 0;
bail_rcvegrbuf_phys:
for (idx = 0; idx < rcd->egrbufs.alloced &&
rcd->egrbufs.buffers[idx].addr;
idx++) {
dma_free_coherent(&dd->pcidev->dev,
rcd->egrbufs.buffers[idx].len,
rcd->egrbufs.buffers[idx].addr,
rcd->egrbufs.buffers[idx].dma);
rcd->egrbufs.buffers[idx].addr = NULL;
rcd->egrbufs.buffers[idx].dma = 0;
rcd->egrbufs.buffers[idx].len = 0;
}
return ret;
}