From c38dbbfab1bc47b0f3a1eceea0fa45e44c477092 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 20 Jun 2019 13:07:00 -0700 Subject: [PATCH 01/42] nvme-fcloop: fix inconsistent lock state warnings With extra debug on, inconsistent lock state warnings are being called out as the tfcp_req->reqlock is being taken out without irq, while some calling sequences have the sequence in a softirq state. Change the lock taking/release to raise/drop irq. Signed-off-by: James Smart Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 42 ++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index b8c1cc54a0db..e64969d2a7c5 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -434,7 +434,7 @@ fcloop_fcp_recv_work(struct work_struct *work) int ret = 0; bool aborted = false; - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); switch (tfcp_req->inistate) { case INI_IO_START: tfcp_req->inistate = INI_IO_ACTIVE; @@ -443,11 +443,11 @@ fcloop_fcp_recv_work(struct work_struct *work) aborted = true; break; default: - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); WARN_ON(1); return; } - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); if (unlikely(aborted)) ret = -ECANCELED; @@ -469,7 +469,7 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) struct nvmefc_fcp_req *fcpreq; bool completed = false; - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); fcpreq = tfcp_req->fcpreq; switch (tfcp_req->inistate) { case INI_IO_ABORTED: @@ -478,11 +478,11 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) completed = true; break; default: - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); WARN_ON(1); return; } - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); if (unlikely(completed)) { /* remove reference taken in original abort downcall */ @@ -494,9 +494,9 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, &tfcp_req->tgt_fcp_req); - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); tfcp_req->fcpreq = NULL; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); /* call_host_done releases reference for abort downcall */ @@ -513,10 +513,10 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work) container_of(work, struct fcloop_fcpreq, tio_done_work); struct nvmefc_fcp_req *fcpreq; - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); fcpreq = tfcp_req->fcpreq; tfcp_req->inistate = INI_IO_COMPLETED; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status); } @@ -621,12 +621,12 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, int fcp_err = 0, active, aborted; u8 op = tgt_fcpreq->op; - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); fcpreq = tfcp_req->fcpreq; active = tfcp_req->active; aborted = tfcp_req->aborted; tfcp_req->active = true; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); if (unlikely(active)) /* illegal - call while i/o active */ @@ -634,9 +634,9 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, if (unlikely(aborted)) { /* target transport has aborted i/o prior */ - spin_lock(&tfcp_req->reqlock); + 
spin_lock_irq(&tfcp_req->reqlock); tfcp_req->active = false; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); tgt_fcpreq->transferred_length = 0; tgt_fcpreq->fcp_error = -ECANCELED; tgt_fcpreq->done(tgt_fcpreq); @@ -693,9 +693,9 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, break; } - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); tfcp_req->active = false; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); tgt_fcpreq->transferred_length = xfrlen; tgt_fcpreq->fcp_error = fcp_err; @@ -715,9 +715,9 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, * (one doing io, other doing abort) and only kills ops posted * after the abort request */ - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); tfcp_req->aborted = true; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); tfcp_req->status = NVME_SC_INTERNAL; @@ -765,7 +765,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, return; /* break initiator/target relationship for io */ - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); switch (tfcp_req->inistate) { case INI_IO_START: case INI_IO_ACTIVE: @@ -775,11 +775,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, abortio = false; break; default: - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); WARN_ON(1); return; } - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); if (abortio) /* leave the reference while the work item is scheduled */ From e0620bf858d3f5e7121d9e429cf7a8f04ab29bf7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 20 Jun 2019 13:17:01 -0700 Subject: [PATCH 02/42] nvme-fcloop: resolve warnings on RCU usage and sleep warnings With additional debugging enabled, seeing warnings for suspicious RCU usage or Sleeping function called from invalid context. These both map to allocation of a work structure which is currently GFP_KERNEL, meaning it can sleep. For the RCU warning, the sequence was sleeping while holding the RCU lock. Convert the allocation to GFP_ATOMIC. Signed-off-by: James Smart Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index e64969d2a7c5..b50b53db3746 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -535,7 +535,7 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport, if (!rport->targetport) return -ECONNREFUSED; - tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_KERNEL); + tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_ATOMIC); if (!tfcp_req) return -ENOMEM; From 21774222324e018f064d4fbb661e3c09c2bcaad0 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 26 Jun 2019 10:09:02 +0800 Subject: [PATCH 03/42] nvme-pci: make nvme_dev_pm_ops static Fix sparse warning: drivers/nvme/host/pci.c:2926:25: warning: symbol 'nvme_dev_pm_ops' was not declared. Should it be static? 
Reported-by: Hulk Robot Signed-off-by: YueHaibing Reviewed-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 189352081994..f50013369cc5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2923,7 +2923,7 @@ static int nvme_simple_resume(struct device *dev) return 0; } -const struct dev_pm_ops nvme_dev_pm_ops = { +static const struct dev_pm_ops nvme_dev_pm_ops = { .suspend = nvme_suspend, .resume = nvme_resume, .freeze = nvme_simple_suspend, From 4fe06923f5181d57178e01add4ba54e269c59e9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Jun 2019 09:17:48 +0200 Subject: [PATCH 04/42] nvme-pci: don't fall back to a 32-bit DMA mask Since Linux 5.0 drivers can safely set the largest DMA mask supported by the device, and don't need fallbacks to work around the dma mapping implementations. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f50013369cc5..49c1fc9907a6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2289,8 +2289,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) pci_set_master(pdev); - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && - dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64))) goto disable; if (readl(dev->bar + NVME_REG_CSTS) == -1) { From 0298d5435276e7795b0b939d74827f6e775e7009 Mon Sep 17 00:00:00 2001 From: Alan Mikhak Date: Mon, 8 Jul 2019 10:24:12 -0700 Subject: [PATCH 05/42] nvme-pci: don't create a read hctx mapping without read queues Only request an IRQ mapping for read queues if at least one read queue is being allocated, as nvme_pci_map_queues() will later on ignore the unnecessary mapping request should nvme_dev_add() request such an IRQ mapping even though no read queues are being allocated. However, nvme_dev_add() can avoid making the request by checking the number of read queues instead of assuming they exist. This would bring it more in line with nvme_setup_irqs() and nvme_calc_irq_sets(). Signed-off-by: Alan Mikhak Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 49c1fc9907a6..0423ddd97f4b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2250,7 +2250,9 @@ static int nvme_dev_add(struct nvme_dev *dev) if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; - dev->tagset.nr_maps = 2; /* default + read */ + dev->tagset.nr_maps = 1; /* default */ + if (dev->io_queues[HCTX_TYPE_READ]) + dev->tagset.nr_maps++; if (dev->io_queues[HCTX_TYPE_POLL]) dev->tagset.nr_maps++; dev->tagset.timeout = NVME_IO_TIMEOUT; From bfac8e9f55cf62a000b643a0081488badbe92d96 Mon Sep 17 00:00:00 2001 From: Alan Mikhak Date: Mon, 8 Jul 2019 10:05:11 -0700 Subject: [PATCH 06/42] nvme-pci: check for NULL return from pci_alloc_p2pmem() Modify nvme_alloc_sq_cmds() to call pci_free_p2pmem() to free the memory it allocated using pci_alloc_p2pmem() in case pci_p2pmem_virt_to_bus() returns null. Also make sure not to call pci_free_p2pmem() if pci_alloc_p2pmem() returned NULL, which can happen if CONFIG_PCI_P2PDMA is not configured. 
The current implementation is not expected to leak since pci_p2pmem_virt_to_bus() is expected to fail only if pci_alloc_p2pmem() returns null. However, checking the return value of pci_alloc_p2pmem() is more explicit. Signed-off-by: Alan Mikhak Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0423ddd97f4b..ac2011b8dac1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1439,11 +1439,15 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); - nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, - nvmeq->sq_cmds); - if (nvmeq->sq_dma_addr) { - set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); - return 0; + if (nvmeq->sq_cmds) { + nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, + nvmeq->sq_cmds); + if (nvmeq->sq_dma_addr) { + set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); + return 0; + } + + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth)); } } From 7637de311bd2124b298a072852448b940d8a34b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Jul 2019 09:54:44 -0700 Subject: [PATCH 07/42] nvme-pci: limit max_hw_sectors based on the DMA max mapping size When running an NVMe device that is attached to an addressing-challenged PCIe root port that requires bounce buffering, our request sizes can easily overflow the swiotlb bounce buffer size. Limit the maximum I/O size to the limit exposed by the DMA mapping subsystem. Signed-off-by: Christoph Hellwig Reported-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ac2011b8dac1..bb970ca82517 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2503,7 +2503,8 @@ static void nvme_reset_work(struct work_struct *work) * Limit the max command size to prevent iod->sg allocations going * over a single page. */ - dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; + dev->ctrl.max_hw_sectors = min_t(u32, + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); dev->ctrl.max_segments = NVME_MAX_SEGS; /* From 91f6d7985310a5dc420066004142c54da2c627d8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 26 Jun 2019 13:43:23 +0100 Subject: [PATCH 08/42] nvme-trace: fix spelling mistake "spcecific" -> "specific" There are two spelling mistakes in trace_seq_printf messages, fix these. 
Signed-off-by: Colin Ian King Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 2 +- drivers/nvme/target/trace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index f01ad0fd60bb..6980ab827233 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -178,7 +178,7 @@ static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc) { const char *ret = trace_seq_buffer_ptr(p); - trace_seq_printf(p, "spcecific=%*ph", 24, spc); + trace_seq_printf(p, "specific=%*ph", 24, spc); trace_seq_putc(p, 0); return ret; } diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index cdcdd14c6408..6af11d493271 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -146,7 +146,7 @@ static const char *nvmet_trace_fabrics_common(struct trace_seq *p, u8 *spc) { const char *ret = trace_seq_buffer_ptr(p); - trace_seq_printf(p, "spcecific=%*ph", 24, spc); + trace_seq_printf(p, "specific=%*ph", 24, spc); trace_seq_putc(p, 0); return ret; } From 4c0181bf6cc81716102308dc47779ad1f5aeded2 Mon Sep 17 00:00:00 2001 From: Tom Wu Date: Thu, 4 Jul 2019 10:19:54 +0000 Subject: [PATCH 09/42] nvme-trace: add delete completion and submission queue to admin cmds tracer The trace log for 'delete I/O submission queue' and 'delete I/O completion queue' command will look like as below: kworker/u49:1-3438 [003] .... 6693.070865: nvme_setup_cmd: nvme0: qid=0, cmdid=11, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_delete_sq sqid=1) kworker/u49:1-3438 [003] .... 6693.071171: nvme_setup_cmd: nvme0: qid=0, cmdid=8, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_delete_cq cqid=24) Signed-off-by: Tom Wu Reviewed-by: Max Gurtovoy Reviewed-by: Minwoo Im Reviewed-by: Israel Rukshin Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 6980ab827233..9778eb0406b3 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -7,6 +7,17 @@ #include #include "trace.h" +static const char *nvme_trace_delete_sq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 sqid = get_unaligned_le16(cdw10); + + trace_seq_printf(p, "sqid=%u", sqid); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -23,6 +34,17 @@ static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_delete_cq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 cqid = get_unaligned_le16(cdw10); + + trace_seq_printf(p, "cqid=%u", cqid); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -107,8 +129,12 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10) { switch (opcode) { + case nvme_admin_delete_sq: + return nvme_trace_delete_sq(p, cdw10); case nvme_admin_create_sq: return nvme_trace_create_sq(p, cdw10); + case nvme_admin_delete_cq: + return nvme_trace_delete_cq(p, cdw10); case nvme_admin_create_cq: return nvme_trace_create_cq(p, cdw10); case nvme_admin_identify: From 9d05a96e298aadb36e3ec971fab8d416e6fb7331 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Jun 2019 
09:53:30 -0700 Subject: [PATCH 10/42] nvmet: export I/O characteristics attributes in Identify Make the NVMe NAWUN, NAWUPF, NACWU, NPWG, NPWA, NPDG and NOWS attributes available to initiator systems for the block backend. Signed-off-by: Bart Van Assche Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 3 +++ drivers/nvme/target/io-cmd-bdev.c | 39 +++++++++++++++++++++++++++++++ drivers/nvme/target/nvmet.h | 8 +++++++ 3 files changed, 50 insertions(+) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 9f72d515fc4b..4dc12ea52f23 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -442,6 +442,9 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) break; } + if (ns->bdev) + nvmet_bdev_set_limits(ns->bdev, id); + /* * We just provide a single LBA format that matches what the * underlying device reports. diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 7a1cf6437a6a..de0bff70ebb6 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -8,6 +8,45 @@ #include #include "nvmet.h" +void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) +{ + const struct queue_limits *ql = &bdev_get_queue(bdev)->limits; + /* Number of physical blocks per logical block. */ + const u32 ppl = ql->physical_block_size / ql->logical_block_size; + /* Physical blocks per logical block, 0's based. */ + const __le16 ppl0b = to0based(ppl); + + /* + * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, + * NAWUPF, and NACWU are defined for this namespace and should be + * used by the host for this namespace instead of the AWUN, AWUPF, + * and ACWU fields in the Identify Controller data structure. If + * any of these fields are zero that means that the corresponding + * field from the identify controller data structure should be used. + */ + id->nsfeat |= 1 << 1; + id->nawun = ppl0b; + id->nawupf = ppl0b; + id->nacwu = ppl0b; + + /* + * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and + * NOWS are defined for this namespace and should be used by + * the host for I/O optimization. + */ + id->nsfeat |= 1 << 4; + /* NPWG = Namespace Preferred Write Granularity. 0's based */ + id->npwg = ppl0b; + /* NPWA = Namespace Preferred Write Alignment. 0's based */ + id->npwa = id->npwg; + /* NPDG = Namespace Preferred Deallocate Granularity. 
0's based */ + id->npdg = to0based(ql->discard_granularity / ql->logical_block_size); + /* NPDG = Namespace Preferred Deallocate Alignment */ + id->npda = id->npdg; + /* NOWS = Namespace Optimal Write Size */ + id->nows = to0based(ql->io_opt / ql->logical_block_size); +} + int nvmet_bdev_ns_enable(struct nvmet_ns *ns) { int ret; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index dc270944bb25..6ee66c610739 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -365,6 +365,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask); void nvmet_execute_async_event(struct nvmet_req *req); u16 nvmet_parse_connect_cmd(struct nvmet_req *req); +void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); u16 nvmet_parse_admin_cmd(struct nvmet_req *req); @@ -492,4 +493,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) } u16 errno_to_nvme_status(struct nvmet_req *req, int errno); + +/* Convert a 32-bit number to a 16-bit 0's based number */ +static inline __le16 to0based(u32 a) +{ + return cpu_to_le16(max(1U, min(1U << 16, a)) - 1); +} + #endif /* _NVMET_H */ From 6605bdd59c21bb34c8f14ac4d6f2d419185f3528 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Jun 2019 09:53:29 -0700 Subject: [PATCH 11/42] nvme: add I/O characteristics fields Several new fields have been introduced in version 1.4 of the NVMe spec at offsets that were defined as reserved in version 1.3d of the NVMe spec. Update the definition of the nvme_id_ns data structure such that it is in sync with version 1.4 of the NVMe spec. This change preserves backwards compatibility. Signed-off-by: Bart Van Assche Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d98b2d8baf4e..01aa6a6c241d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -315,7 +315,7 @@ struct nvme_id_ns { __u8 nmic; __u8 rescap; __u8 fpi; - __u8 rsvd33; + __u8 dlfeat; __le16 nawun; __le16 nawupf; __le16 nacwu; @@ -324,11 +324,17 @@ struct nvme_id_ns { __le16 nabspf; __le16 noiob; __u8 nvmcap[16]; - __u8 rsvd64[28]; + __le16 npwg; + __le16 npwa; + __le16 npdg; + __le16 npda; + __le16 nows; + __u8 rsvd74[18]; __le32 anagrpid; __u8 rsvd96[3]; __u8 nsattr; - __u8 rsvd100[4]; + __le16 nvmsetid; + __le16 endgid; __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; From 81adb863349157c67ccec871e5ae5574600c50be Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Jun 2019 09:53:31 -0700 Subject: [PATCH 12/42] nvme: set physical block size and optimal I/O size >From the NVMe 1.4 spec: NSFEAT bit 4 if set to 1: indicates that the fields NPWG, NPWA, NPDG, NPDA, and NOWS are defined for this namespace and should be used by the host for I/O optimization; [ ... ] Namespace Preferred Write Granularity (NPWG): This field indicates the smallest recommended write granularity in logical blocks for this namespace. This is a 0's based value. The size indicated should be less than or equal to Maximum Data Transfer Size (MDTS) that is specified in units of minimum memory page size. The value of this field may change if the namespace is reformatted. The size should be a multiple of Namespace Preferred Write Alignment (NPWA). 
Refer to section 8.25 for how this field is utilized to improve performance and endurance. [ ... ] Each Write, Write Uncorrectable, or Write Zeroes commands should address a multiple of Namespace Preferred Write Granularity (NPWG) (refer to Figure 245) and Stream Write Size (SWS) (refer to Figure 515) logical blocks (as expressed in the NLB field), and the SLBA field of the command should be aligned to Namespace Preferred Write Alignment (NPWA) (refer to Figure 245) for best performance. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 34 ++++++++++++++++++++++++++++++++-- drivers/nvme/host/nvme.h | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b2dd4e391f5c..5417110cbf1b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1626,6 +1626,7 @@ static void nvme_update_disk_info(struct gendisk *disk, { sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9); unsigned short bs = 1 << ns->lba_shift; + u32 atomic_bs, phys_bs, io_opt; if (ns->lba_shift > PAGE_SHIFT) { /* unsupported block size, set capacity to 0 later */ @@ -1634,9 +1635,37 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); + if (id->nabo == 0) { + /* + * Bit 1 indicates whether NAWUPF is defined for this namespace + * and whether it should be used instead of AWUPF. If NAWUPF == + * 0 then AWUPF must be used instead. + */ + if (id->nsfeat & (1 << 1) && id->nawupf) + atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; + else + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + } else { + atomic_bs = bs; + } + phys_bs = bs; + io_opt = bs; + if (id->nsfeat & (1 << 4)) { + /* NPWG = Namespace Preferred Write Granularity */ + phys_bs *= 1 + le16_to_cpu(id->npwg); + /* NOWS = Namespace Optimal Write Size */ + io_opt *= 1 + le16_to_cpu(id->nows); + } + blk_queue_logical_block_size(disk->queue, bs); - blk_queue_physical_block_size(disk->queue, bs); - blk_queue_io_min(disk->queue, bs); + /* + * Linux filesystems assume writing a single physical block is + * an atomic operation. Hence limit the physical block size to the + * value of the Atomic Write Unit Power Fail parameter. + */ + blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); + blk_queue_io_min(disk->queue, phys_bs); + blk_queue_io_opt(disk->queue, io_opt); if (ns->ms && !ns->ext && (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) @@ -2433,6 +2462,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; + subsys->awupf = le16_to_cpu(id->awupf); #ifdef CONFIG_NVME_MULTIPATH subsys->iopolicy = NVME_IOPOLICY_NUMA; #endif diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ea45d7d393ad..716a876119c8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -283,6 +283,7 @@ struct nvme_subsystem { char firmware_rev[8]; u8 cmic; u16 vendor_id; + u16 awupf; /* 0's based awupf value. 
*/ struct ida ns_ida; #ifdef CONFIG_NVME_MULTIPATH enum nvme_iopolicy iopolicy; From ca7ae5c966bd4c00626d6ba05d68219f3c1fba36 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 4 Jul 2019 08:10:46 +0200 Subject: [PATCH 13/42] nvme-multipath: factor out a nvme_path_is_disabled helper Factor out a common helper to check if a path has been disabled by something other than the per-namespace ANA state. Signed-off-by: Hannes Reinecke [hch: split from a bigger patch] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 499acf07d61a..5a6dbb422a9c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -123,14 +123,19 @@ void nvme_mpath_clear_current_path(struct nvme_ns *ns) } } +static bool nvme_path_is_disabled(struct nvme_ns *ns) +{ + return ns->ctrl->state != NVME_CTRL_LIVE || + test_bit(NVME_NS_ANA_PENDING, &ns->flags); +} + static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) { int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; struct nvme_ns *found = NULL, *fallback = NULL, *ns; list_for_each_entry_rcu(ns, &head->list, siblings) { - if (ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags)) + if (nvme_path_is_disabled(ns)) continue; if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) @@ -184,8 +189,7 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, for (ns = nvme_next_ns(head, old); ns != old; ns = nvme_next_ns(head, ns)) { - if (ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags)) + if (nvme_path_is_disabled(ns)) continue; if (ns->ana_state == NVME_ANA_OPTIMIZED) { From 2032d074716a811440aa9cd2e971a0716646d6af Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 4 Jul 2019 08:10:46 +0200 Subject: [PATCH 14/42] nvme-multipath: also check for a disabled path if there is a single sibling When we have a singular list in nvme_round_robin_path() we still need to check its validity. Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5a6dbb422a9c..9b6dc11fa559 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -183,8 +183,11 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, { struct nvme_ns *ns, *found, *fallback = NULL; - if (list_is_singular(&head->list)) + if (list_is_singular(&head->list)) { + if (nvme_path_is_disabled(old)) + return NULL; return old; + } for (ns = nvme_next_ns(head, old); ns != old; From 04e70bd4a0264a3d488a9eff6e116d7dc9a77967 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 4 Jul 2019 08:10:47 +0200 Subject: [PATCH 15/42] nvme-multipath: do not select namespaces which are about to be removed nvme_ns_remove() will first set the NVME_NS_REMOVING flag before removing it from the list at the very last step. So to avoid selecting a namespace in nvme_find_path() which is about to be removed check the NVME_NS_REMOVING flag, too, when selecting a new path. 
Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9b6dc11fa559..a9a927677970 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -126,7 +126,8 @@ void nvme_mpath_clear_current_path(struct nvme_ns *ns) static bool nvme_path_is_disabled(struct nvme_ns *ns) { return ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags); + test_bit(NVME_NS_ANA_PENDING, &ns->flags) || + test_bit(NVME_NS_REMOVING, &ns->flags); } static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) From 5ba895033b8e8257451e6f85e6e516c3b3ce1a68 Mon Sep 17 00:00:00 2001 From: Mikhail Skorzhinskii Date: Thu, 4 Jul 2019 10:01:48 +0200 Subject: [PATCH 16/42] nvmet: print a hint while rejecting NSID 0 or 0xffffffff Adding this hint for the sake of convenience. It was spotted a few times that people spent some time before understanding what exactly is wrong in the configuration process. This should save some time in such situations, especially for people who are not very confident with NVMe requirements. Signed-off-by: Mikhail Skorzhinskii Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 08dd5af357f7..cd52b9f15376 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -588,8 +588,10 @@ static struct config_group *nvmet_ns_make(struct config_group *group, goto out; ret = -EINVAL; - if (nsid == 0 || nsid == NVME_NSID_ALL) + if (nsid == 0 || nsid == NVME_NSID_ALL) { + pr_err("invalid nsid %#x", nsid); goto out; + } ret = -ENOMEM; ns = nvmet_ns_alloc(subsys, nsid); From 958f2a0f8121ae36a5cbff383ab94fadf1fba5eb Mon Sep 17 00:00:00 2001 From: Mikhail Skorzhinskii Date: Thu, 4 Jul 2019 09:59:18 +0200 Subject: [PATCH 17/42] nvme-tcp: set the STABLE_WRITES flag when data digests are enabled There were a few false alarms sighted on the target side about wrong data digests while performing high-throughput load to an XFS filesystem shared through NVMoF TCP. This flag tells the rest of the kernel to ensure that the data buffer does not change while the write is in flight. It incurs a performance penalty, so only enable it when it is actually needed, i.e. when we are calculating data digests. Although even with this change in place, ext2 users can still experience false positives, as ext2 does not respect this flag. This may apply to vfat as well. 
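For reference, the consumer of this flag lives in the generic writeback code, not in this patch: with BDI_CAP_STABLE_WRITES set on the namespace's backing_dev_info, wait_for_stable_page() makes a page wait for any in-flight writeback before it can be re-dirtied, which is what keeps the buffer stable while the TCP data digest is computed. Roughly (a sketch of the existing mm/page-writeback.c helper as of this kernel generation, shown only for context):

void wait_for_stable_page(struct page *page)
{
	/* only wait when the backing device asked for stable pages */
	if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
		wait_on_page_writeback(page);
}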
Signed-off-by: Mikhail Skorzhinskii Signed-off-by: Mike Playle Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5417110cbf1b..f4340dc1d399 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -3304,6 +3305,10 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) goto out_free_ns; } + if (ctrl->opts->data_digest) + ns->queue->backing_dev_info->capabilities + |= BDI_CAP_STABLE_WRITES; + blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); From 37c15219599f7a4baa73f6e3432afc69ba7cc530 Mon Sep 17 00:00:00 2001 From: Mikhail Skorzhinskii Date: Mon, 8 Jul 2019 12:31:29 +0200 Subject: [PATCH 18/42] nvme-tcp: don't use sendpage for SLAB pages According to commit a10674bf2406 ("tcp: detecting the misuse of .sendpage for Slab objects") and previous discussion, tcp_sendpage should not be used for pages that are managed by SLAB, as SLAB is not taking page reference counters into consideration. Signed-off-by: Mikhail Skorzhinskii Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 08a2501b9357..606b13d35d16 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -860,7 +860,14 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) else flags |= MSG_MORE; - ret = kernel_sendpage(queue->sock, page, offset, len, flags); + /* can't zcopy slab pages */ + if (unlikely(PageSlab(page))) { + ret = sock_no_sendpage(queue->sock, page, offset, len, + flags); + } else { + ret = kernel_sendpage(queue->sock, page, offset, len, + flags); + } if (ret <= 0) return ret; From 4c73cbdff1119d088ed16d63def59ad32b11b18f Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 28 Jun 2019 17:26:08 -0700 Subject: [PATCH 19/42] nvme-fc: fix module unloads while lports still pending Current code allows the module to be unloaded even if there are pending data structures, such as localports and controllers on the localports, that have yet to hit their reference counting to remove them. Fix by having the exit entrypoint explicitly delete every controller, which in turn will remove references on the remoteports and localports causing them to be deleted as well. The exit entrypoint, after initiating the deletes, will wait for the last localport to be deleted before continuing. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 51 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9b497d785ed7..1a391aa1f7d5 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -204,6 +204,9 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt); static struct workqueue_struct *nvme_fc_wq; +static bool nvme_fc_waiting_to_unload; +static DECLARE_COMPLETION(nvme_fc_unload_proceed); + /* * These items are short-term. They will eventually be moved into * a generic FC class. See comments in module init. 
@@ -229,6 +232,8 @@ nvme_fc_free_lport(struct kref *ref) /* remove from transport list */ spin_lock_irqsave(&nvme_fc_lock, flags); list_del(&lport->port_list); + if (nvme_fc_waiting_to_unload && list_empty(&nvme_fc_lport_list)) + complete(&nvme_fc_unload_proceed); spin_unlock_irqrestore(&nvme_fc_lock, flags); ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); @@ -3456,11 +3461,51 @@ static int __init nvme_fc_init_module(void) return ret; } +static void +nvme_fc_delete_controllers(struct nvme_fc_rport *rport) +{ + struct nvme_fc_ctrl *ctrl; + + spin_lock(&rport->lock); + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: transport unloading: deleting ctrl\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } + spin_unlock(&rport->lock); +} + +static void +nvme_fc_cleanup_for_unload(void) +{ + struct nvme_fc_lport *lport; + struct nvme_fc_rport *rport; + + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + list_for_each_entry(rport, &lport->endp_list, endp_list) { + nvme_fc_delete_controllers(rport); + } + } +} + static void __exit nvme_fc_exit_module(void) { - /* sanity check - all lports should be removed */ - if (!list_empty(&nvme_fc_lport_list)) - pr_warn("%s: localport list not empty\n", __func__); + unsigned long flags; + bool need_cleanup = false; + + spin_lock_irqsave(&nvme_fc_lock, flags); + nvme_fc_waiting_to_unload = true; + if (!list_empty(&nvme_fc_lport_list)) { + need_cleanup = true; + nvme_fc_cleanup_for_unload(); + } + spin_unlock_irqrestore(&nvme_fc_lock, flags); + if (need_cleanup) { + pr_info("%s: waiting for ctlr deletes\n", __func__); + wait_for_completion(&nvme_fc_unload_proceed); + pr_info("%s: ctrl deletes complete\n", __func__); + } nvmf_unregister_transport(&nvme_fc_transport); From b554db147feea39617b533ab6bca247c91c6198a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 7 Mar 2019 21:37:18 +0000 Subject: [PATCH 20/42] block: init flush rq ref count to 1 We discovered a problem in newer kernels where a disconnect of an NBD device while the flush request was pending would result in a hang. This is because the blk mq timeout handler does if (!refcount_inc_not_zero(&rq->ref)) return true; to determine if it's ok to run the timeout handler for the request. Flush_rq's don't have a ref count set, so we'd skip running the timeout handler for this request and it would just sit there in limbo forever. Fix this by always setting the refcount of any request going through blk_rq_init() to 1. I tested this with an nbd-server that dropped flush requests to verify that it hung, and then tested with this patch to verify I got the timeout as expected and the error handling kicked in. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-core.c b/block/blk-core.c index 5d1fc8e17dd1..edd009213f5b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->internal_tag = -1; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; + refcount_set(&rq->ref, 1); } EXPORT_SYMBOL(blk_rq_init); From fd112c74652371a023f85d87b70bee7169e8f4d0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 9 Jul 2019 14:41:29 -0700 Subject: [PATCH 21/42] blk-cgroup: turn on psi memstall stuff With the psi stuff in place we can use the memstall flag to indicate pressure that happens from throttling. 
Signed-off-by: Josef Bacik Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 53b7bd4c7000..8afa52b0d148 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "blk.h" #define MAX_KEY_LEN 100 @@ -1587,6 +1588,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { + unsigned long pflags; u64 now = ktime_to_ns(ktime_get()); u64 exp; u64 delay_nsec = 0; @@ -1613,11 +1615,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) */ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); - /* - * TODO: the use_memdelay flag is going to be for the upcoming psi stuff - * that hasn't landed upstream yet. Once that stuff is in place we need - * to do a psi_memstall_enter/leave if memdelay is set. - */ + if (use_memdelay) + psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); @@ -1627,6 +1626,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); + + if (use_memdelay) + psi_memstall_leave(&pflags); } /** From 9b0eb69b75bccada2d341d7e7ca342f0cb1c9a6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Jun 2019 13:39:48 -0700 Subject: [PATCH 22/42] cgroup, blkcg: Prepare some symbols for module and !CONFIG_CGROUP usages btrfs is going to use css_put() and wbc helpers to improve cgroup writeback support. Add dummy css_get() definition and export wbc helpers to prepare for module and !CONFIG_CGROUP builds. 
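To illustrate why the exports matter, here is a hypothetical sketch (example_write_inode_pages() is an invented name, not part of this patch) of the attach/detach bracket a filesystem built as a module places around per-inode writeback; it only becomes linkable once the helpers below are exported:

/* Hypothetical illustration only: the cgroup-writeback bracket around
 * per-inode writeback that a modular filesystem relies on. */
static int example_write_inode_pages(struct inode *inode,
				     struct writeback_control *wbc)
{
	int ret;

	spin_lock(&inode->i_lock);
	wbc_attach_and_unlock_inode(wbc, inode);	/* associate wbc with the inode's wb; drops i_lock */
	ret = do_writepages(inode->i_mapping, wbc);	/* bios issued here are tagged via wbc_init_bio() */
	wbc_detach_inode(wbc);				/* finish foreign-inode detection bookkeeping */
	return ret;
}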
Reported-by: kbuild test robot Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 1 + fs/fs-writeback.c | 3 +++ include/linux/cgroup.h | 1 + 3 files changed, 5 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8afa52b0d148..ad7a91dec934 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -48,6 +48,7 @@ struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; +EXPORT_SYMBOL_GPL(blkcg_root_css); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9ebfb1b28430..a8a40bc26c2f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -270,6 +270,7 @@ void __inode_attach_wb(struct inode *inode, struct page *page) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } +EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it @@ -582,6 +583,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb))) inode_switch_wbs(inode, wbc->wb_id); } +EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -701,6 +703,7 @@ void wbc_detach_inode(struct writeback_control *wbc) wb_put(wbc->wb); wbc->wb = NULL; } +EXPORT_SYMBOL_GPL(wbc_detach_inode); /** * wbc_account_io - account IO issued during writeback diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3745ecdad925..852d885df10a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -699,6 +699,7 @@ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, struct cgroup_subsys_state; struct cgroup; +static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } From 34e51a5e1a6e939ed7d99c38173821ab86d577f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Jun 2019 13:39:49 -0700 Subject: [PATCH 23/42] blkcg, writeback: Rename wbc_account_io() to wbc_account_cgroup_owner() wbc_account_io() does a very specific job - try to see which cgroup is actually dirtying an inode and transfer its ownership to the majority dirtier if needed. The name is too generic and confusing. Let's rename it to something more specific. Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 2 +- fs/btrfs/extent_io.c | 4 ++-- fs/buffer.c | 2 +- fs/ext4/page-io.c | 2 +- fs/f2fs/data.c | 4 ++-- fs/fs-writeback.c | 8 ++++---- fs/mpage.c | 2 +- include/linux/writeback.h | 8 ++++---- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index a5c845338d6d..6223f485f7e1 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2114,7 +2114,7 @@ following two functions. a queue (device) has been associated with the bio and before submission. - wbc_account_io(@wbc, @page, @bytes) + wbc_account_cgroup_owner(@wbc, @page, @bytes) Should be called for each data segment being written out. 
While this function doesn't care exactly when it's called during the writeback session, it's the easiest and most diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index db337e53aab3..5106008f5e28 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2911,7 +2911,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, bio = NULL; } else { if (wbc) - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); return 0; } } @@ -2924,7 +2924,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, bio->bi_opf = opf; if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); } *bio_ret = bio; diff --git a/fs/buffer.c b/fs/buffer.c index e450c55f6434..40547bbbea94 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3093,7 +3093,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); + wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); } submit_bio(bio); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4690618a92e9..56e287f5ee50 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -404,7 +404,7 @@ static int io_submit_add_bh(struct ext4_io_submit *io, ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - wbc_account_io(io->io_wbc, page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size); io->io_next_block++; return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index eda4181d2092..e1cab1717ac7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -470,7 +470,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); bio_set_op_attrs(bio, fio->op, fio->op_flags); @@ -537,7 +537,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) } if (fio->io_wbc) - wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a8a40bc26c2f..0aef79e934bb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -706,7 +706,7 @@ void wbc_detach_inode(struct writeback_control *wbc) EXPORT_SYMBOL_GPL(wbc_detach_inode); /** - * wbc_account_io - account IO issued during writeback + * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress * @page: page being written out * @bytes: number of bytes being written out @@ -715,8 +715,8 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). 
*/ -void wbc_account_io(struct writeback_control *wbc, struct page *page, - size_t bytes) +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, + size_t bytes) { struct cgroup_subsys_state *css; int id; @@ -753,7 +753,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } -EXPORT_SYMBOL_GPL(wbc_account_io); +EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); /** * inode_congested - test whether an inode is congested diff --git a/fs/mpage.c b/fs/mpage.c index 436a85260394..a63620cdb73a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -647,7 +647,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ - wbc_account_io(wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 738a0c24874f..dda5cf228172 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -188,8 +188,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); -void wbc_account_io(struct writeback_control *wbc, struct page *page, - size_t bytes); +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, + size_t bytes); void cgroup_writeback_umount(void); /** @@ -291,8 +291,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } -static inline void wbc_account_io(struct writeback_control *wbc, - struct page *page, size_t bytes) +static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, + struct page *page, size_t bytes) { } From 27b36d8fa81fa8274fb72f4eb1484026f6b6daa8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Jun 2019 13:39:50 -0700 Subject: [PATCH 24/42] blkcg, writeback: Add wbc->no_cgroup_owner When writeback IOs are bounced through async layers, the IOs should only be accounted against the wbc from the original bdi writeback to avoid confusing cgroup inode ownership arbitration. Add wbc->no_cgroup_owner to allow disabling wbc cgroup owner accounting. This will be used make btrfs compression work well with cgroup IO control. v2: Renamed from no_wbc_acct to no_cgroup_owner and added comment as per Jan. Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0aef79e934bb..542b02d170f8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -727,7 +727,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. 
*/ - if (!wbc->wb) + if (!wbc->wb || wbc->no_cgroup_owner) return; css = mem_cgroup_css_from_page(page); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index dda5cf228172..33a50fa09fac 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -68,6 +68,15 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ + + /* + * When writeback IOs are bounced through async layers, only the + * initial synchronous phase should be accounted towards inode + * cgroup ownership arbitration to avoid confusion. Later stages + * can set the following flag to disable the accounting. + */ + unsigned no_cgroup_owner:1; + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ From 653c45c6b90c9659facbef10546d1f3a8e37d0cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Jun 2019 13:39:51 -0700 Subject: [PATCH 25/42] blkcg, writeback: Implement wbc_blkcg_css() Add a helper to determine the target blkcg from wbc. Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/writeback.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 33a50fa09fac..e056a22075cf 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,6 +11,7 @@ #include #include #include +#include struct bio; @@ -101,6 +102,16 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc) return 0; } +static inline struct cgroup_subsys_state * +wbc_blkcg_css(struct writeback_control *wbc) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + if (wbc->wb) + return wbc->wb->blkcg_css; +#endif + return blkcg_root_css; +} + /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global From d3f77dfdc71835f8db71ca57d272b1fbec9dfc18 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Jun 2019 13:39:52 -0700 Subject: [PATCH 26/42] blkcg: implement REQ_CGROUP_PUNT When a shared kthread needs to issue a bio for a cgroup, doing so synchronously can lead to priority inversions as the kthread can be trapped waiting for that cgroup. This patch implements REQ_CGROUP_PUNT flag which makes submit_bio() punt the actual issuing to a dedicated per-blkcg work item to avoid such priority inversions. This will be used to fix priority inversions in btrfs compression and should be generally useful as we grow filesystem support for comprehensive IO control. 
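To make the submitter side concrete, a hypothetical sketch (example_submit_async_write() is invented for illustration; the intended real user is the later btrfs compression work): a shared worker sets the new wbc bit so that wbc_to_write_flags() below folds in REQ_CGROUP_PUNT, and submit_bio() then bounces the bio to the per-blkcg worker instead of issuing it in the shared worker's own context:

/* Hypothetical illustration only: opting an async writeback bio in to
 * cgroup punting. Assumes bio->bi_blkg was already set up, e.g. via
 * bio_associate_blkg_from_css(bio, wbc_blkcg_css(wbc)). */
static void example_submit_async_write(struct bio *bio,
				       struct writeback_control *wbc)
{
	wbc->punt_to_cgroup = 1;	/* new flag added below */
	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);	/* includes REQ_CGROUP_PUNT */
	submit_bio(bio);		/* rerouted by blkcg_punt_bio_submit() unless in the root cgroup */
}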
Cc: Chris Mason Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 53 +++++++++++++++++++++++++++++++++++++ block/blk-core.c | 3 +++ include/linux/backing-dev.h | 1 + include/linux/blk-cgroup.h | 16 ++++++++++- include/linux/blk_types.h | 10 +++++++ include/linux/writeback.h | 17 ++++++++---- 6 files changed, 94 insertions(+), 6 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ad7a91dec934..24ed26957367 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -55,6 +55,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ static bool blkcg_debug_stats = false; +static struct workqueue_struct *blkcg_punt_bio_wq; static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) @@ -89,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + WARN_ON(!bio_list_empty(&blkg->async_bios)); + /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); if (blkg->parent) @@ -114,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref) call_rcu(&blkg->rcu_head, __blkg_release); } +static void blkg_async_bio_workfn(struct work_struct *work) +{ + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, + async_bio_work); + struct bio_list bios = BIO_EMPTY_LIST; + struct bio *bio; + + /* as long as there are pending bios, @blkg can't go away */ + spin_lock_bh(&blkg->async_bio_lock); + bio_list_merge(&bios, &blkg->async_bios); + bio_list_init(&blkg->async_bios); + spin_unlock_bh(&blkg->async_bio_lock); + + while ((bio = bio_list_pop(&bios))) + submit_bio(bio); +} + /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -142,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); + spin_lock_init(&blkg->async_bio_lock); + bio_list_init(&blkg->async_bios); + INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); blkg->blkcg = blkcg; for (i = 0; i < BLKCG_MAX_POLS; i++) { @@ -1528,6 +1551,25 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); +bool __blkcg_punt_bio_submit(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + /* consume the flag first */ + bio->bi_opf &= ~REQ_CGROUP_PUNT; + + /* never bounce for the root cgroup */ + if (!blkg->parent) + return false; + + spin_lock_bh(&blkg->async_bio_lock); + bio_list_add(&blkg->async_bios, bio); + spin_unlock_bh(&blkg->async_bio_lock); + + queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); + return true; +} + /* * Scale the accumulated delay based on how long it has been since we updated * the delay. 
We only call this when we are adding delay, in case it's been a @@ -1729,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) atomic64_add(delta, &blkg->delay_nsec); } +static int __init blkcg_init(void) +{ + blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", + WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!blkcg_punt_bio_wq) + return -ENOMEM; + return 0; +} +subsys_initcall(blkcg_init); + module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/block/blk-core.c b/block/blk-core.c index edd009213f5b..260e36a2c343 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1128,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request); */ blk_qc_t submit_bio(struct bio *bio) { + if (blkcg_punt_bio_submit(bio)) + return BLK_QC_T_NONE; + /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f9b029180241..35b31d176f74 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -48,6 +48,7 @@ extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; +extern struct workqueue_struct *bdi_async_bio_wq; static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 33f23a858438..689a58231288 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -132,13 +132,17 @@ struct blkcg_gq { struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - struct rcu_head rcu_head; + spinlock_t async_bio_lock; + struct bio_list async_bios; + struct work_struct async_bio_work; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; + + struct rcu_head rcu_head; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); @@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg struct bio *bio) { return false; } #endif +bool __blkcg_punt_bio_submit(struct bio *bio); + +static inline bool blkcg_punt_bio_submit(struct bio *bio) +{ + if (bio->bi_opf & REQ_CGROUP_PUNT) + return __blkcg_punt_bio_submit(bio); + else + return false; +} static inline void blkcg_bio_issue_init(struct bio *bio) { @@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } +static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { return true; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6a53799c3fe2..feff3fe4467e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -311,6 +311,14 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ + /* + * When a shared kthread needs to issue a bio for a cgroup, doing + * so synchronously can lead to priority inversions as the kthread + * can be trapped waiting for that cgroup. CGROUP_PUNT flag makes + * submit_bio() punt the actual issuing to a dedicated per-blkcg + * work item to avoid such priority inversions. 
+ */ + __REQ_CGROUP_PUNT, /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ @@ -337,6 +345,8 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) +#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) + #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) #define REQ_HIPRI (1ULL << __REQ_HIPRI) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e056a22075cf..8945aac31392 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -78,6 +78,8 @@ struct writeback_control { */ unsigned no_cgroup_owner:1; + unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */ + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ @@ -94,12 +96,17 @@ struct writeback_control { static inline int wbc_to_write_flags(struct writeback_control *wbc) { - if (wbc->sync_mode == WB_SYNC_ALL) - return REQ_SYNC; - else if (wbc->for_kupdate || wbc->for_background) - return REQ_BACKGROUND; + int flags = 0; - return 0; + if (wbc->punt_to_cgroup) + flags = REQ_CGROUP_PUNT; + + if (wbc->sync_mode == WB_SYNC_ALL) + flags |= REQ_SYNC; + else if (wbc->for_kupdate || wbc->for_background) + flags |= REQ_BACKGROUND; + + return flags; } static inline struct cgroup_subsys_state * From 113ab72ed4794c193509a97d7c6d32a6886e1682 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Jul 2019 13:53:10 +0900 Subject: [PATCH 27/42] block: Fix potential overflow in blk_report_zones() For large values of the number of zones reported and/or large zone sizes, the sector increment calculated with blk_queue_zone_sectors(q) * n in blk_report_zones() loop can overflow the unsigned int type used for the calculation as both "n" and blk_queue_zone_sectors() value are unsigned int. E.g. for a device with 256 MB zones (524288 sectors), overflow happens with 8192 or more zones reported. Changing the return type of blk_queue_zone_sectors() to sector_t, fixes this problem and avoids overflow problem for all other callers of this helper too. The same change is also applied to the bdev_zone_sectors() helper. Fixes: e76239a3748c ("block: add a report_zones method") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 +- include/linux/blkdev.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ae7e91bd0618..3249738242b4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); static inline unsigned int __blkdev_nr_zones(struct request_queue *q, sector_t nr_sectors) { - unsigned long zone_sectors = blk_queue_zone_sectors(q); + sector_t zone_sectors = blk_queue_zone_sectors(q); return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0c482371c8b3..259bd7ad8312 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -681,7 +681,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } } -static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) +static inline sector_t blk_queue_zone_sectors(struct request_queue *q) { return blk_queue_is_zoned(q) ? 
q->limits.chunk_sectors : 0; } @@ -1418,7 +1418,7 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } -static inline unsigned int bdev_zone_sectors(struct block_device *bdev) +static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); From 3a10f999ffd464d01c5a05592a15470a3c4bbc36 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Jul 2019 18:29:57 +0300 Subject: [PATCH 28/42] blk-throttle: fix zero wait time for iops throttled group After commit 991f61fe7e1d ("Blk-throttle: reduce tail io latency when iops limit is enforced") wait time could be zero even if group is throttled and cannot issue requests right now. As a result throtl_select_dispatch() turns into busy-loop under irq-safe queue spinlock. Fix is simple: always round up target time to the next throttle slice. Fixes: 991f61fe7e1d ("Blk-throttle: reduce tail io latency when iops limit is enforced") Signed-off-by: Konstantin Khlebnikov Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Jens Axboe --- block/blk-throttle.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9ea7c0ecad10..8ab6c8153223 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; - jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + jiffy_elapsed = jiffies - tg->slice_start[rw]; - /* Slice has just started. Consider one slice interval */ - if (!jiffy_elapsed) - jiffy_elapsed_rnd = tg->td->throtl_slice; - - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); + /* Round up to the next throttle slice, wait time must be nonzero */ + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); /* * jiffy_elapsed_rnd should not be a big value as minimum iops can be From 420dc733f980246f2179e0144f9cedab9ad4a91e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 10 Jul 2019 09:31:31 -0700 Subject: [PATCH 29/42] nvme: fix regression upon hot device removal and insertion When we validate the new controller id, we want to skip controllers that are either deleting or dead. Fix the check to do that and not on the newly added controller. Fixes: 1b1031ca63b2 ("nvme: validate cntlid during controller initialisation") Reported-by: Jon Derrick Tested-by: Jon Derrick Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f4340dc1d399..3077cd4d75bf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2416,8 +2416,8 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, lockdep_assert_held(&nvme_subsystems_lock); list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { - if (ctrl->state == NVME_CTRL_DELETING || - ctrl->state == NVME_CTRL_DEAD) + if (tmp->state == NVME_CTRL_DELETING || + tmp->state == NVME_CTRL_DEAD) continue; if (tmp->cntlid == ctrl->cntlid) { From 36847a005489cfb74dc6388952da73346f867dca Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 11 Jul 2019 00:56:08 +0900 Subject: [PATCH 30/42] block: Remove unused definitions The ELV_MQUEUE_XXX definitions in include/linux/elevator.h are unused since the removal of elevator_may_queue_fn in kernel 5.0. 
Remove these definitions and also remove the documentation of elevator_may_queue_fn in Documentation/block/biodoc.txt. Acked-by: Marcos Paulo de Souza Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 5 ----- include/linux/elevator.h | 9 --------- 2 files changed, 14 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 31c177663ed5..5a4a799fe61b 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -843,11 +843,6 @@ elevator_latter_req_fn These return the request before or after the elevator_completed_req_fn called when a request is completed. -elevator_may_queue_fn returns true if the scheduler wants to allow the - current context to queue a new request even if - it is over the queue limit. This must be used - very carefully!! - elevator_set_req_fn elevator_put_req_fn Must be used to allocate and free any elevator specific storage for a request. diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 169bb2e02516..38590c30a11d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -160,15 +160,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define ELEVATOR_INSERT_FLUSH 5 #define ELEVATOR_INSERT_SORT_MERGE 6 -/* - * return values from elevator_may_queue_fn - */ -enum { - ELV_MQUEUE_MAY, - ELV_MQUEUE_NO, - ELV_MQUEUE_MUST, -}; - #define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) From 9305d5d721f2bd5e2eeb670035159b560ca211ca Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 11 Jul 2019 00:57:41 +0900 Subject: [PATCH 31/42] block: Fix elevator name declaration The elevator_name field in struct elevator_type is declared as an array of characters (ELV_NAME_MAX size) but in practice used as a string pointer with its initialization done statically within each elevator's elevator_type structure declaration. Change the declaration of elevator_name to the more appropriate "const char *" type. Acked-by: Marcos Paulo de Souza Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 38590c30a11d..17cd0078377c 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -75,7 +75,7 @@ struct elevator_type size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; - char elevator_name[ELV_NAME_MAX]; + const char *elevator_name; const char *elevator_alias; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS From b49773e7bcf316f238f6709ad9e1999dcc3ed433 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 11 Jul 2019 01:18:31 +0900 Subject: [PATCH 32/42] block: Disable write plugging for zoned block devices Simultaneously writing to a sequential zone of a zoned block device from multiple contexts requires mutual exclusion for BIO issuing to ensure that writes happen sequentially. However, even for a well behaved user correctly implementing such synchronization, BIO plugging may interfere and result in BIOs from the different contexts being reordered if plugging is done outside of the mutual exclusion section, e.g. the plug was started by a function higher in the call chain than the function issuing BIOs. Context A Context B | blk_start_plug() | ...
| seq_write_zone() | mutex_lock(zone) | bio-0->bi_iter.bi_sector = zone->wp | zone->wp += bio_sectors(bio-0) | submit_bio(bio-0) | bio-1->bi_iter.bi_sector = zone->wp | zone->wp += bio_sectors(bio-1) | submit_bio(bio-1) | mutex_unlock(zone) | return | -----------------------> | seq_write_zone() | mutex_lock(zone) | bio-2->bi_iter.bi_sector = zone->wp | zone->wp += bio_sectors(bio-2) | submit_bio(bio-2) | mutex_unlock(zone) | <------------------------- | | blk_finish_plug() In the above example, despite the mutex synchronization ensuring the correct BIO issuing order 0, 1, 2, context A BIOs 0 and 1 end up being issued after BIO 2 of context B, when the plug is released with blk_finish_plug(). While this problem can be addressed using the blk_flush_plug_list() function (in the above example, the call must be inserted before the zone mutex lock is released), a simple generic solution in the block layer avoids the need for this additional code in all zoned block device user code. The simple generic solution implemented with this patch is to introduce the internal helper function blk_mq_plug() to access the current context plug on BIO submission. This helper returns the current plug only if the target device is not a zoned block device or if the BIO to be plugged is not a write operation. Otherwise, the caller context plug is ignored and NULL is returned, so that writes to zoned block devices are never plugged. Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 2 +- block/blk-mq.h | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 260e36a2c343..d0cc6e14d2f0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -688,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, struct request *rq; struct list_head *plug_list; - plug = current->plug; + plug = blk_mq_plug(q, bio); if (!plug) return false; diff --git a/block/blk-mq.c b/block/blk-mq.c index e5ef40c603ca..b038ec680e84 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio, nr_segs); - plug = current->plug; + plug = blk_mq_plug(q, bio); if (unlikely(is_flush_fua)) { /* bypass scheduler for flush rq */ blk_insert_flush(rq); diff --git a/block/blk-mq.h b/block/blk-mq.h index f4bf5161333e..32c62c64e6c2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) qmap->mq_map[cpu] = 0; } +/* + * blk_mq_plug() - Get caller context plug + * @q: request queue + * @bio : the bio being submitted by the caller context + * + * Plugging, by design, may delay the insertion of BIOs into the elevator in + * order to increase BIO merging opportunities. This however can cause BIO + * insertion order to change from the order in which submit_bio() is being + * executed in the case of multiple contexts concurrently issuing BIOs to a + * device, even if these context are synchronized to tightly control BIO issuing + * order. While this is not a problem with regular block devices, this ordering + * change can cause write BIO failures with zoned block devices as these + * require sequential write patterns to zones. Prevent this from happening by + * ignoring the plug state of a BIO issuing context if the target request queue + * is for a zoned block device and the BIO to plug is a write operation. 
+ * + * Return current->plug if the bio can be plugged and NULL otherwise + */ +static inline struct blk_plug *blk_mq_plug(struct request_queue *q, + struct bio *bio) +{ + /* + * For regular block devices or read operations, use the context plug + * which may be NULL if blk_start_plug() was not executed. + */ + if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio))) + return current->plug; + + /* Zoned block device write operation case: do not plug the BIO */ + return NULL; +} + #endif From 553768d1169a48c0cd87c4eb4ab57534ee663415 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 29 May 2019 15:16:05 -0500 Subject: [PATCH 33/42] nbd: fix crash when the blksize is zero This will allow the blksize to be set zero and then use 1024 as default. Reviewed-by: Josef Bacik Signed-off-by: Xiubo Li [fix to use goto out instead of return in genl_connect] Signed-off-by: Mike Christie Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3a9bca3aa093..5c4e3f2160bf 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -134,6 +134,8 @@ static struct dentry *nbd_dbg_dir; #define NBD_MAGIC 0x68797548 +#define NBD_DEF_BLKSIZE 1024 + static unsigned int nbds_max = 16; static int max_part = 16; static struct workqueue_struct *recv_workqueue; @@ -1236,6 +1238,14 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd, nbd_config_put(nbd); } +static bool nbd_is_valid_blksize(unsigned long blksize) +{ + if (!blksize || !is_power_of_2(blksize) || blksize < 512 || + blksize > PAGE_SIZE) + return false; + return true; +} + /* Must be called with config_lock held */ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) @@ -1251,8 +1261,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_SET_SOCK: return nbd_add_socket(nbd, arg, false); case NBD_SET_BLKSIZE: - if (!arg || !is_power_of_2(arg) || arg < 512 || - arg > PAGE_SIZE) + if (!arg) + arg = NBD_DEF_BLKSIZE; + if (!nbd_is_valid_blksize(arg)) return -EINVAL; nbd_size_set(nbd, arg, div_s64(config->bytesize, arg)); @@ -1332,7 +1343,7 @@ static struct nbd_config *nbd_alloc_config(void) atomic_set(&config->recv_threads, 0); init_waitqueue_head(&config->recv_wq); init_waitqueue_head(&config->conn_wait); - config->blksize = 1024; + config->blksize = NBD_DEF_BLKSIZE; atomic_set(&config->live_connections, 0); try_module_get(THIS_MODULE); return config; @@ -1768,6 +1779,12 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { u64 bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); + if (!bsize) + bsize = NBD_DEF_BLKSIZE; + if (!nbd_is_valid_blksize(bsize)) { + ret = -EINVAL; + goto out; + } nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize)); } if (info->attrs[NBD_ATTR_TIMEOUT]) { From 4ddeaae8903d703201e493e2d19dc9ac9acf2c76 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 29 May 2019 15:16:06 -0500 Subject: [PATCH 34/42] nbd: add netlink reconfigure resize support If the device is setup with ioctl we can resize the device after the initial setup, but if the device is setup with netlink we cannot use the resize related ioctls and there is no netlink reconfigure size ATTR handling code. This patch adds netlink reconfigure resize support to match the ioctl interface. 
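For illustration only (not part of this patch), a minimal userspace sketch of driving the new resize path could look as follows. It assumes libnl-genl and the NBD generic netlink constants from <linux/nbd-netlink.h> (NBD_CMD_RECONFIGURE, NBD_ATTR_INDEX, NBD_ATTR_SIZE_BYTES); error handling is kept to a bare minimum and the device is assumed to have been configured over netlink beforehand:

  /*
   * Illustrative sketch: resize a netlink-configured nbd device
   * (index 0) to 2 GiB via NBD_CMD_RECONFIGURE.
   * Build (assumption): gcc resize.c $(pkg-config --cflags --libs libnl-genl-3.0)
   */
  #include <stdint.h>
  #include <netlink/netlink.h>
  #include <netlink/genl/genl.h>
  #include <netlink/genl/ctrl.h>
  #include <linux/nbd-netlink.h>

  int main(void)
  {
          uint64_t new_size = 2ULL << 30;                 /* 2 GiB */
          struct nl_sock *sk = nl_socket_alloc();
          struct nl_msg *msg;
          int family;

          if (!sk || genl_connect(sk))
                  return 1;

          /* resolve the "nbd" generic netlink family */
          family = genl_ctrl_resolve(sk, "nbd");
          if (family < 0)
                  return 1;

          msg = nlmsg_alloc();
          if (!msg)
                  return 1;

          /* NBD_CMD_RECONFIGURE now also honors the size attributes */
          genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
                      NBD_CMD_RECONFIGURE, 0);
          nla_put_u32(msg, NBD_ATTR_INDEX, 0);            /* /dev/nbd0 */
          nla_put_u64(msg, NBD_ATTR_SIZE_BYTES, new_size);

          if (nl_send_auto(sk, msg) < 0)
                  return 1;

          nlmsg_free(msg);
          nl_socket_free(sk);
          return 0;
  }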
Reviewed-by: Josef Bacik Signed-off-by: Mike Christie Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 48 ++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 5c4e3f2160bf..9bcde2325893 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1684,6 +1684,30 @@ nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = { [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, }; +static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) +{ + struct nbd_config *config = nbd->config; + u64 bsize = config->blksize; + u64 bytes = config->bytesize; + + if (info->attrs[NBD_ATTR_SIZE_BYTES]) + bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); + + if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { + bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); + if (!bsize) + bsize = NBD_DEF_BLKSIZE; + if (!nbd_is_valid_blksize(bsize)) { + printk(KERN_ERR "Invalid block size %llu\n", bsize); + return -EINVAL; + } + } + + if (bytes != config->bytesize || bsize != config->blksize) + nbd_size_set(nbd, bsize, div64_u64(bytes, bsize)); + return 0; +} + static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { struct nbd_device *nbd = NULL; @@ -1771,22 +1795,10 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) refcount_set(&nbd->config_refs, 1); set_bit(NBD_BOUND, &config->runtime_flags); - if (info->attrs[NBD_ATTR_SIZE_BYTES]) { - u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); - nbd_size_set(nbd, config->blksize, - div64_u64(bytes, config->blksize)); - } - if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { - u64 bsize = - nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); - if (!bsize) - bsize = NBD_DEF_BLKSIZE; - if (!nbd_is_valid_blksize(bsize)) { - ret = -EINVAL; - goto out; - } - nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize)); - } + ret = nbd_genl_size_set(info, nbd); + if (ret) + goto out; + if (info->attrs[NBD_ATTR_TIMEOUT]) { u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); nbd->tag_set.timeout = timeout * HZ; @@ -1955,6 +1967,10 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) goto out; } + ret = nbd_genl_size_set(info, nbd); + if (ret) + goto out; + if (info->attrs[NBD_ATTR_TIMEOUT]) { u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); nbd->tag_set.timeout = timeout * HZ; From 7d30c81b80ea9b0812d27030a46a5bf4c4e328f5 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Fri, 12 Jul 2019 02:04:47 +0900 Subject: [PATCH 35/42] nvme: fix NULL deref for fabrics options git://git.infradead.org/nvme.git nvme-5.3 branch now causes the following NULL deref oops. Check the ctrl->opts first before the deref. 
[ 16.337581] BUG: kernel NULL pointer dereference, address: 0000000000000056 [ 16.338551] #PF: supervisor read access in kernel mode [ 16.338551] #PF: error_code(0x0000) - not-present page [ 16.338551] PGD 0 P4D 0 [ 16.338551] Oops: 0000 [#1] SMP PTI [ 16.338551] CPU: 2 PID: 1035 Comm: kworker/u16:5 Not tainted 5.2.0-rc6+ #1 [ 16.338551] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 16.338551] Workqueue: nvme-wq nvme_scan_work [nvme_core] [ 16.338551] RIP: 0010:nvme_validate_ns+0xc9/0x7e0 [nvme_core] [ 16.338551] Code: c0 49 89 c5 0f 84 00 07 00 00 48 8b 7b 58 e8 be 48 39 c1 48 3d 00 f0 ff ff 49 89 45 18 0f 87 a4 06 00 00 48 8b 93 70 0a 00 00 <80> 7a 56 00 74 0c 48 8b 40 68 83 48 3c 08 49 8b 45 18 48 89 c6 bf [ 16.338551] RSP: 0018:ffffc900024c7d10 EFLAGS: 00010283 [ 16.338551] RAX: ffff888135a30720 RBX: ffff88813a4fd1f8 RCX: 0000000000000007 [ 16.338551] RDX: 0000000000000000 RSI: ffffffff8256dd38 RDI: ffff888135a30720 [ 16.338551] RBP: 0000000000000001 R08: 0000000000000007 R09: ffff88813aa6a840 [ 16.338551] R10: 0000000000000001 R11: 000000000002d060 R12: ffff88813a4fd1f8 [ 16.338551] R13: ffff88813a77f800 R14: ffff88813aa35180 R15: 0000000000000001 [ 16.338551] FS: 0000000000000000(0000) GS:ffff88813ba80000(0000) knlGS:0000000000000000 [ 16.338551] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 16.338551] CR2: 0000000000000056 CR3: 000000000240a002 CR4: 0000000000360ee0 [ 16.338551] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 16.338551] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 16.338551] Call Trace: [ 16.338551] nvme_scan_work+0x2c0/0x340 [nvme_core] [ 16.338551] ? __switch_to_asm+0x40/0x70 [ 16.338551] ? _raw_spin_unlock_irqrestore+0x18/0x30 [ 16.338551] ? try_to_wake_up+0x408/0x450 [ 16.338551] process_one_work+0x20b/0x3e0 [ 16.338551] worker_thread+0x1f9/0x3d0 [ 16.338551] ? cancel_delayed_work+0xa0/0xa0 [ 16.338551] kthread+0x117/0x120 [ 16.338551] ? 
kthread_stop+0xf0/0xf0 [ 16.338551] ret_from_fork+0x3a/0x50 [ 16.338551] Modules linked in: nvme nvme_core [ 16.338551] CR2: 0000000000000056 [ 16.338551] ---[ end trace b9bf761a93e62d84 ]--- [ 16.338551] RIP: 0010:nvme_validate_ns+0xc9/0x7e0 [nvme_core] [ 16.338551] Code: c0 49 89 c5 0f 84 00 07 00 00 48 8b 7b 58 e8 be 48 39 c1 48 3d 00 f0 ff ff 49 89 45 18 0f 87 a4 06 00 00 48 8b 93 70 0a 00 00 <80> 7a 56 00 74 0c 48 8b 40 68 83 48 3c 08 49 8b 45 18 48 89 c6 bf [ 16.338551] RSP: 0018:ffffc900024c7d10 EFLAGS: 00010283 [ 16.338551] RAX: ffff888135a30720 RBX: ffff88813a4fd1f8 RCX: 0000000000000007 [ 16.338551] RDX: 0000000000000000 RSI: ffffffff8256dd38 RDI: ffff888135a30720 [ 16.338551] RBP: 0000000000000001 R08: 0000000000000007 R09: ffff88813aa6a840 [ 16.338551] R10: 0000000000000001 R11: 000000000002d060 R12: ffff88813a4fd1f8 [ 16.338551] R13: ffff88813a77f800 R14: ffff88813aa35180 R15: 0000000000000001 [ 16.338551] FS: 0000000000000000(0000) GS:ffff88813ba80000(0000) knlGS:0000000000000000 [ 16.338551] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 16.338551] CR2: 0000000000000056 CR3: 000000000240a002 CR4: 0000000000360ee0 [ 16.338551] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 16.338551] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 958f2a0f8121 ("nvme-tcp: set the STABLE_WRITES flag when data digests are enabled") Cc: Christoph Hellwig Cc: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3077cd4d75bf..cc09b81fc7f4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3305,7 +3305,7 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) goto out_free_ns; } - if (ctrl->opts->data_digest) + if (ctrl->opts && ctrl->opts->data_digest) ns->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; From e7bf90e5afe3aa1d1282c1635a49e17a32c4ecec Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Thu, 11 Jul 2019 14:22:02 -0500 Subject: [PATCH 36/42] block/bio-integrity: fix a memory leak bug In bio_integrity_prep(), a kernel buffer is allocated through kmalloc() to hold integrity metadata. Later on, the buffer will be attached to the bio structure through bio_integrity_add_page(), which returns the number of bytes of integrity metadata attached. Due to unexpected situations, bio_integrity_add_page() may return 0. As a result, bio_integrity_prep() needs to be terminated with 'false' returned to indicate this error. However, the allocated kernel buffer is not freed on this execution path, leading to a memory leak. To fix this issue, free the allocated buffer before returning from bio_integrity_prep(). Reviewed-by: Ming Lei Acked-by: Martin K. 
Petersen Signed-off-by: Wenwen Wang Signed-off-by: Jens Axboe --- block/bio-integrity.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4db620849515..fb95dbb21dd8 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -276,8 +276,12 @@ bool bio_integrity_prep(struct bio *bio) ret = bio_integrity_add_page(bio, virt_to_page(buf), bytes, offset); - if (ret == 0) - return false; + if (ret == 0) { + printk(KERN_ERR "could not attach integrity payload\n"); + kfree(buf); + status = BLK_STS_RESOURCE; + goto err_end_io; + } if (ret < bytes) break; From b4c5875d36178e8df409bdce232f270cac89fafe Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 1 Jul 2019 14:09:15 +0900 Subject: [PATCH 37/42] block: Allow mapping of vmalloc-ed buffers To allow the SCSI subsystem scsi_execute_req() function to issue requests using large buffers that are better allocated with vmalloc() rather than kmalloc(), modify bio_map_kern() to allow passing a buffer allocated with vmalloc(). To do so, detect vmalloc-ed buffers using is_vmalloc_addr(). For vmalloc-ed buffers, flush the buffer using flush_kernel_vmap_range(), use vmalloc_to_page() instead of virt_to_page() to obtain the pages of the buffer, and invalidate the buffer addresses with invalidate_kernel_vmap_range() on completion of read BIOs. This last point is executed using the function bio_invalidate_vmalloc_pages() which is defined only if the architecture defines ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE, that is, if the architecture actually needs the invalidation done. Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation") Fixes: e76239a3748c ("block: add a report_zones method") Cc: stable@vger.kernel.org Reviewed-by: Martin K. 
Petersen Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 29cd6cf4da51..299a0e7651ec 100644 --- a/block/bio.c +++ b/block/bio.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "blk.h" @@ -1441,8 +1442,22 @@ void bio_unmap_user(struct bio *bio) bio_put(bio); } +static void bio_invalidate_vmalloc_pages(struct bio *bio) +{ +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE + if (bio->bi_private && !op_is_write(bio_op(bio))) { + unsigned long i, len = 0; + + for (i = 0; i < bio->bi_vcnt; i++) + len += bio->bi_io_vec[i].bv_len; + invalidate_kernel_vmap_range(bio->bi_private, len); + } +#endif +} + static void bio_map_kern_endio(struct bio *bio) { + bio_invalidate_vmalloc_pages(bio); bio_put(bio); } @@ -1463,6 +1478,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT; const int nr_pages = end - start; + bool is_vmalloc = is_vmalloc_addr(data); + struct page *page; int offset, i; struct bio *bio; @@ -1470,6 +1487,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, if (!bio) return ERR_PTR(-ENOMEM); + if (is_vmalloc) { + flush_kernel_vmap_range(data, len); + bio->bi_private = data; + } + offset = offset_in_page(kaddr); for (i = 0; i < nr_pages; i++) { unsigned int bytes = PAGE_SIZE - offset; @@ -1480,7 +1502,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, if (bytes > len) bytes = len; - if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, + if (!is_vmalloc) + page = virt_to_page(data); + else + page = vmalloc_to_page(data); + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ bio_put(bio); From bd976e52725965ddcceb9abecbcc7ca46863665c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 1 Jul 2019 14:09:16 +0900 Subject: [PATCH 38/42] block: Kill gfp_t argument of blkdev_report_zones() Only GFP_KERNEL and GFP_NOIO are used with blkdev_report_zones(). In preparation of using vmalloc() for large report buffer and zone array allocations used by this function, remove its "gfp_t gfp_mask" argument and rely on the caller context to use memalloc_noio_save/restore() where necessary (block layer zone revalidation and dm-zoned I/O error path). Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 31 +++++++++++++++++++------------ drivers/block/null_blk.h | 3 +-- drivers/block/null_blk_zoned.c | 3 +-- drivers/md/dm-flakey.c | 5 ++--- drivers/md/dm-linear.c | 5 ++--- drivers/md/dm-zoned-metadata.c | 16 ++++++++++++---- drivers/md/dm.c | 6 ++---- drivers/scsi/sd.h | 3 +-- drivers/scsi/sd_zbc.c | 6 ++---- fs/f2fs/super.c | 4 +--- include/linux/blkdev.h | 5 ++--- include/linux/device-mapper.h | 3 +-- 12 files changed, 46 insertions(+), 44 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 3249738242b4..58ced170b424 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "blk.h" @@ -117,8 +118,7 @@ static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep) } static int blk_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct request_queue *q = disk->queue; unsigned int z = 0, n, nrz = *nr_zones; @@ -127,8 +127,7 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector, while (z < nrz && sector < capacity) { n = nrz - z; - ret = disk->fops->report_zones(disk, sector, &zones[z], &n, - gfp_mask); + ret = disk->fops->report_zones(disk, sector, &zones[z], &n); if (ret) return ret; if (!n) @@ -149,17 +148,18 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector, * @sector: Sector from which to report zones * @zones: Array of zone structures where to return the zones information * @nr_zones: Number of zone structures in the zone array - * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: * Get zone information starting from the zone containing @sector. * The number of zone information reported may be less than the number * requested by @nr_zones. The number of zones actually reported is * returned in @nr_zones. + * The caller must use memalloc_noXX_save/restore() calls to control + * memory allocations done within this function (zone array and command + * buffer allocation by the device driver). */ int blkdev_report_zones(struct block_device *bdev, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct request_queue *q = bdev_get_queue(bdev); unsigned int i, nrz; @@ -184,7 +184,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, nrz = min(*nr_zones, __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector)); ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector, - zones, &nrz, gfp_mask); + zones, &nrz); if (ret) return ret; @@ -305,9 +305,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!zones) return -ENOMEM; - ret = blkdev_report_zones(bdev, rep.sector, - zones, &rep.nr_zones, - GFP_KERNEL); + ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones); if (ret) goto out; @@ -415,6 +413,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; unsigned int i, rep_nr_zones = 0, z = 0, nrz; struct blk_zone *zones = NULL; + unsigned int noio_flag; sector_t sector = 0; int ret = 0; @@ -427,6 +426,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk) return 0; } + /* + * Ensure that all memory allocations in this context are done as + * if GFP_NOIO was specified. 
+ */ + noio_flag = memalloc_noio_save(); + if (!blk_queue_is_zoned(q) || !nr_zones) { nr_zones = 0; goto update; @@ -449,7 +454,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) while (z < nr_zones) { nrz = min(nr_zones - z, rep_nr_zones); - ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO); + ret = blk_report_zones(disk, sector, zones, &nrz); if (ret) goto out; if (!nrz) @@ -480,6 +485,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk) blk_mq_unfreeze_queue(q); out: + memalloc_noio_restore(noio_flag); + free_pages((unsigned long)zones, get_order(rep_nr_zones * sizeof(struct blk_zone))); kfree(seq_zones_wlock); diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 34b22d6523ba..4b9bbe3bb5a1 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -89,8 +89,7 @@ struct nullb { int null_zone_init(struct nullb_device *dev); void null_zone_exit(struct nullb_device *dev); int null_zone_report(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask); + struct blk_zone *zones, unsigned int *nr_zones); void null_zone_write(struct nullb_cmd *cmd, sector_t sector, unsigned int nr_sectors); void null_zone_reset(struct nullb_cmd *cmd, sector_t sector); diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index fca0c97ff1aa..cb28d93f2bd1 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -67,8 +67,7 @@ void null_zone_exit(struct nullb_device *dev) } int null_zone_report(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct nullb *nullb = disk->private_data; struct nullb_device *dev = nullb->dev; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index a9bc518156f2..2900fbde89b3 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -461,15 +461,14 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev #ifdef CONFIG_BLK_DEV_ZONED static int flakey_report_zones(struct dm_target *ti, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct flakey_c *fc = ti->private; int ret; /* Do report and remap it */ ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector), - zones, nr_zones, gfp_mask); + zones, nr_zones); if (ret != 0) return ret; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index ad980a38fb1e..ecefe6703736 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -137,15 +137,14 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev #ifdef CONFIG_BLK_DEV_ZONED static int linear_report_zones(struct dm_target *ti, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct linear_c *lc = (struct linear_c *) ti->private; int ret; /* Do report and remap it */ ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector), - zones, nr_zones, gfp_mask); + zones, nr_zones); if (ret != 0) return ret; diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index d8334cd45d7c..9faf3e49c7af 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -8,6 +8,7 @@ #include #include +#include #define DM_MSG_PREFIX "zoned metadata" @@ -1162,8 +1163,7 @@ static int dmz_init_zones(struct dmz_metadata *zmd) 
while (sector < dev->capacity) { /* Get zone information */ nr_blkz = DMZ_REPORT_NR_ZONES; - ret = blkdev_report_zones(dev->bdev, sector, blkz, - &nr_blkz, GFP_KERNEL); + ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz); if (ret) { dmz_dev_err(dev, "Report zones failed %d", ret); goto out; @@ -1201,12 +1201,20 @@ static int dmz_init_zones(struct dmz_metadata *zmd) static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) { unsigned int nr_blkz = 1; + unsigned int noio_flag; struct blk_zone blkz; int ret; - /* Get zone information from disk */ + /* + * Get zone information from disk. Since blkdev_report_zones() uses + * GFP_KERNEL by default for memory allocations, set the per-task + * PF_MEMALLOC_NOIO flag so that all allocations are done as if + * GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), - &blkz, &nr_blkz, GFP_NOIO); + &blkz, &nr_blkz); + memalloc_noio_restore(noio_flag); if (!nr_blkz) ret = -EIO; if (ret) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5475081dcbd6..61f1152b74e9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -441,8 +441,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) } static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { #ifdef CONFIG_BLK_DEV_ZONED struct mapped_device *md = disk->private_data; @@ -480,8 +479,7 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, * So there is no need to loop here trying to fill the entire array * of zones. */ - ret = tgt->type->report_zones(tgt, sector, zones, - nr_zones, gfp_mask); + ret = tgt->type->report_zones(tgt, sector, zones, nr_zones); out: dm_put_live_table(md, srcu_idx); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 5796ace76225..38c50946fc42 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -213,8 +213,7 @@ extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd); extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask); + struct blk_zone *zones, unsigned int *nr_zones); #else /* CONFIG_BLK_DEV_ZONED */ diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 7334024b64f1..ec3764c8f3f1 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -109,13 +109,11 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf, * @sector: Start 512B sector of the report * @zones: Array of zone descriptors * @nr_zones: Number of descriptors in the array - * @gfp_mask: Memory allocation mask * * Execute a report zones command on the target disk. 
*/ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct scsi_disk *sdkp = scsi_disk(disk); unsigned int i, buflen, nrz = *nr_zones; @@ -134,7 +132,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, */ buflen = min(queue_max_hw_sectors(disk->queue) << 9, roundup((nrz + 1) * 64, 512)); - buf = kmalloc(buflen, gfp_mask); + buf = kmalloc(buflen, GFP_KERNEL); if (!buf) return -ENOMEM; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6b959bbb336a..4e91ba6c8a2e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2841,9 +2841,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) while (zones && sector < nr_sectors) { nr_zones = F2FS_REPORT_NR_ZONES; - err = blkdev_report_zones(bdev, sector, - zones, &nr_zones, - GFP_KERNEL); + err = blkdev_report_zones(bdev, sector, zones, &nr_zones); if (err) break; if (!nr_zones) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 259bd7ad8312..05036e3e3458 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -347,7 +347,7 @@ struct queue_limits { extern unsigned int blkdev_nr_zones(struct block_device *bdev); extern int blkdev_report_zones(struct block_device *bdev, sector_t sector, struct blk_zone *zones, - unsigned int *nr_zones, gfp_t gfp_mask); + unsigned int *nr_zones); extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); extern int blk_revalidate_disk_zones(struct gendisk *disk); @@ -1673,8 +1673,7 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask); + struct blk_zone *zones, unsigned int *nr_zones); struct module *owner; const struct pr_ops *pr_ops; }; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index e1f51d607cc5..3b470cb03b66 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -95,8 +95,7 @@ typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device ** typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector, struct blk_zone *zones, - unsigned int *nr_zones, - gfp_t gfp_mask); + unsigned int *nr_zones); /* * These iteration functions are typically used to check (and combine) From b091ac616846a1da75b1f2566b41255ce7f0e0a6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 1 Jul 2019 14:09:17 +0900 Subject: [PATCH 39/42] sd_zbc: Fix report zones buffer allocation During disk scan and revalidation done with sd_revalidate(), the zones of a zoned disk are checked using the helper function blk_revalidate_disk_zones() if a configuration change is detected (change in the number of zones or zone size). The function blk_revalidate_disk_zones() issues report_zones calls that are very large, that is, to obtain zone information for all zones of the disk with a single command. The size of the report zones command buffer necessary for such large request generally is lower than the disk max_hw_sectors and KMALLOC_MAX_SIZE (4MB) and succeeds on boot (no memory fragmentation), but often fail at run time (e.g. hot-plug event). This causes the disk revalidation to fail and the disk capacity to be changed to 0. 
This problem can be avoided by using vmalloc() instead of kmalloc() for the buffer allocation. To limit the amount of memory to be allocated, this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES maximum number of zones to report with a single report zones command. This limit may be lowered further to satisfy the disk max_hw_sectors limit. Finally, to ensure that the vmalloc-ed buffer can always be mapped in a request, the buffer size is further limited to at most queue_max_segments() pages, allowing successful mapping of the buffer even in the worst case scenario where none of the buffer pages are contiguous. Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation") Fixes: e76239a3748c ("block: add a report_zones method") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/scsi/sd_zbc.c | 104 ++++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 29 deletions(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index ec3764c8f3f1..db16c19e05c4 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -9,6 +9,8 @@ */ #include +#include +#include #include @@ -50,7 +52,7 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, /** * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command. * @sdkp: The target disk - * @buf: Buffer to use for the reply + * @buf: vmalloc-ed buffer to use for the reply * @buflen: the buffer size * @lba: Start LBA of the report * @partial: Do partial report @@ -79,7 +81,6 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf, put_unaligned_be32(buflen, &cmd[10]); if (partial) cmd[14] = ZBC_REPORT_ZONE_PARTIAL; - memset(buf, 0, buflen); result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, buf, buflen, &sshdr, @@ -103,6 +104,53 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf, return 0; } +/* + * Maximum number of zones to get with one report zones command. + */ +#define SD_ZBC_REPORT_MAX_ZONES 8192U + +/** + * Allocate a buffer for report zones reply. + * @sdkp: The target disk + * @nr_zones: Maximum number of zones to report + * @buflen: Size of the buffer allocated + * + * Try to allocate a reply buffer for the number of requested zones. + * The size of the buffer allocated may be smaller than requested to + * satify the device constraint (max_hw_sectors, max_segments, etc). + * + * Return the address of the allocated buffer and update @buflen with + * the size of the allocated buffer. + */ +static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = sdkp->disk->queue; + size_t bufsize; + void *buf; + + /* + * Report zone buffer size should be at most 64B times the number of + * zones requested plus the 64B reply header, but should be at least + * SECTOR_SIZE for ATA devices. + * Make sure that this size does not exceed the hardware capabilities. + * Furthermore, since the report zone command cannot be split, make + * sure that the allocated buffer can always be mapped by limiting the + * number of pages allocated to the HBA max segments limit. 
+ */ + nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES); + bufsize = roundup((nr_zones + 1) * 64, 512); + bufsize = min_t(size_t, bufsize, + queue_max_hw_sectors(q) << SECTOR_SHIFT); + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + + buf = vzalloc(bufsize); + if (buf) + *buflen = bufsize; + + return buf; +} + /** * sd_zbc_report_zones - Disk report zones operation. * @disk: The target disk @@ -116,30 +164,23 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, struct blk_zone *zones, unsigned int *nr_zones) { struct scsi_disk *sdkp = scsi_disk(disk); - unsigned int i, buflen, nrz = *nr_zones; + unsigned int i, nrz = *nr_zones; unsigned char *buf; - size_t offset = 0; + size_t buflen = 0, offset = 0; int ret = 0; if (!sd_is_zoned(sdkp)) /* Not a zoned device */ return -EOPNOTSUPP; - /* - * Get a reply buffer for the number of requested zones plus a header, - * without exceeding the device maximum command size. For ATA disks, - * buffers must be aligned to 512B. - */ - buflen = min(queue_max_hw_sectors(disk->queue) << 9, - roundup((nrz + 1) * 64, 512)); - buf = kmalloc(buflen, GFP_KERNEL); + buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen); if (!buf) return -ENOMEM; ret = sd_zbc_do_report_zones(sdkp, buf, buflen, sectors_to_logical(sdkp->device, sector), true); if (ret) - goto out_free_buf; + goto out; nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64); for (i = 0; i < nrz; i++) { @@ -150,8 +191,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, *nr_zones = nrz; -out_free_buf: - kfree(buf); +out: + kvfree(buf); return ret; } @@ -285,8 +326,6 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, return 0; } -#define SD_ZBC_BUF_SIZE 131072U - /** * sd_zbc_check_zones - Check the device capacity and zone sizes * @sdkp: Target disk @@ -302,22 +341,28 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, */ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) { + size_t bufsize, buflen; + unsigned int noio_flag; u64 zone_blocks = 0; sector_t max_lba, block = 0; unsigned char *buf; unsigned char *rec; - unsigned int buf_len; - unsigned int list_length; int ret; u8 same; + /* Do all memory allocations as if GFP_NOIO was specified */ + noio_flag = memalloc_noio_save(); + /* Get a buffer */ - buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; + buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES, + &bufsize); + if (!buf) { + ret = -ENOMEM; + goto out; + } /* Do a report zone to get max_lba and the same field */ - ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false); + ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false); if (ret) goto out_free; @@ -353,12 +398,12 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) do { /* Parse REPORT ZONES header */ - list_length = get_unaligned_be32(&buf[0]) + 64; + buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64, + bufsize); rec = buf + 64; - buf_len = min(list_length, SD_ZBC_BUF_SIZE); /* Parse zone descriptors */ - while (rec < buf + buf_len) { + while (rec < buf + buflen) { u64 this_zone_blocks = get_unaligned_be64(&rec[8]); if (zone_blocks == 0) { @@ -374,8 +419,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) } if (block < sdkp->capacity) { - ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, - block, true); + ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block, + true); if (ret) goto out_free; } @@ -406,7 +451,8 @@ static int 
sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) } out_free: - kfree(buf); + memalloc_noio_restore(noio_flag); + kvfree(buf); return ret; } From 26202928fafad8bda8b478edb7e62c885be623d7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 1 Jul 2019 14:09:18 +0900 Subject: [PATCH 40/42] block: Limit zone array allocation size Limit the size of the struct blk_zone array used in blk_revalidate_disk_zones() to avoid memory allocation failures leading to disk revalidation failure. Also further reduce the likelyhood of such failures by using kvcalloc() (that is vmalloc()) instead of allocating contiguous pages with alloc_pages(). Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation") Fixes: e76239a3748c ("block: add a report_zones method") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 34 +++++++++++++++++++--------------- include/linux/blkdev.h | 5 +++++ 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 58ced170b424..6c503824ba3f 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include "blk.h" @@ -371,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node, * Allocate an array of struct blk_zone to get nr_zones zone information. * The allocated array may be smaller than nr_zones. */ -static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones) +static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones) { - size_t size = *nr_zones * sizeof(struct blk_zone); - struct page *page; - int order; + struct blk_zone *zones; + size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES); - for (order = get_order(size); order >= 0; order--) { - page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); - if (page) { - *nr_zones = min_t(unsigned int, *nr_zones, - (PAGE_SIZE << order) / sizeof(struct blk_zone)); - return page_address(page); - } + /* + * GFP_KERNEL here is meaningless as the caller task context has + * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones() + * with memalloc_noio_save(). + */ + zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) { + *nr_zones = 0; + return NULL; } - return NULL; + *nr_zones = nrz; + + return zones; } void blk_queue_free_zone_bitmaps(struct request_queue *q) @@ -448,7 +453,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) /* Get zone information and initialize seq_zones_bitmap */ rep_nr_zones = nr_zones; - zones = blk_alloc_zones(q->node, &rep_nr_zones); + zones = blk_alloc_zones(&rep_nr_zones); if (!zones) goto out; @@ -487,8 +492,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) out: memalloc_noio_restore(noio_flag); - free_pages((unsigned long)zones, - get_order(rep_nr_zones * sizeof(struct blk_zone))); + kvfree(zones); kfree(seq_zones_wlock); kfree(seq_zones_bitmap); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 05036e3e3458..1ef375dafb1c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -344,6 +344,11 @@ struct queue_limits { #ifdef CONFIG_BLK_DEV_ZONED +/* + * Maximum number of zones to report with a single report zones command. 
+ */ +#define BLK_ZONED_REPORT_MAX_ZONES 8192U + extern unsigned int blkdev_nr_zones(struct block_device *bdev); extern int blkdev_report_zones(struct block_device *bdev, sector_t sector, struct blk_zone *zones, From e347946439ed70e3af0d0c330b36d5648e71727b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Jul 2019 09:09:57 -0600 Subject: [PATCH 41/42] null_blk: fixup ->report_zones() for !CONFIG_BLK_DEV_ZONED A previous commit changed the prototype, but didn't adjust the function for when zoned device support is disabled. Fix it up. Fixes: bd976e527259 ("block: Kill gfp_t argument of blkdev_report_zones()") Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 4b9bbe3bb5a1..a1b9929bd911 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -102,7 +102,7 @@ static inline int null_zone_init(struct nullb_device *dev) static inline void null_zone_exit(struct nullb_device *dev) {} static inline int null_zone_report(struct gendisk *disk, sector_t sector, struct blk_zone *zones, - unsigned int *nr_zones, gfp_t gfp_mask) + unsigned int *nr_zones) { return -EOPNOTSUPP; } From 787c79d6393fc028887cc1b6066915f0b094e92f Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 13:19:47 +0300 Subject: [PATCH 42/42] MAINTAINERS: add entry for block io cgroup This links mailing list cgroups@vger.kernel.org with related files. $ ./scripts/get_maintainer.pl -f block/blk-cgroup.c Jens Axboe (maintainer:BLOCK LAYER) cgroups@vger.kernel.org (open list:CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO)) linux-block@vger.kernel.org (open list:BLOCK LAYER) linux-kernel@vger.kernel.org (open list) Signed-off-by: Konstantin Khlebnikov Added git tree/maintainer entries from Tejun. Signed-off-by: Jens Axboe --- MAINTAINERS | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f9acdbac0001..2f138975d820 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4148,6 +4148,19 @@ S: Maintained F: mm/memcontrol.c F: mm/swap_cgroup.c +CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO) +M: Tejun Heo +M: Jens Axboe +L: cgroups@vger.kernel.org +L: linux-block@vger.kernel.org +T: git git://git.kernel.dk/linux-block +F: Documentation/cgroup-v1/blkio-controller.rst +F: block/blk-cgroup.c +F: include/linux/blk-cgroup.h +F: block/blk-throttle.c +F: block/blk-iolatency.c +F: block/bfq-cgroup.c + CORETEMP HARDWARE MONITORING DRIVER M: Fenghua Yu L: linux-hwmon@vger.kernel.org