linux-brain/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

1973 lines
50 KiB
C
Raw Normal View History

2014-07-17 07:27:00 +09:00
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <linux/ratelimit.h>
#include <linux/printk.h>
2014-07-17 07:27:00 +09:00
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/sched.h>
2014-07-17 07:27:00 +09:00
#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
#include "kfd_mqd_manager.h"
#include "cik_regs.h"
#include "kfd_kernel_queue.h"
#include "amdgpu_amdkfd.h"
2014-07-17 07:27:00 +09:00
/* Size of the per-pipe EOP queue */
#define CIK_HPD_EOP_BYTES_LOG2 11
#define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2)
static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
unsigned int pasid, unsigned int vmid);
static int execute_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param);
static int unmap_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param);
2014-07-17 07:27:00 +09:00
static int map_queues_cpsch(struct device_queue_manager *dqm);
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q);
2014-07-17 07:27:00 +09:00
static inline void deallocate_hqd(struct device_queue_manager *dqm,
struct queue *q);
static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q);
static void kfd_process_hw_exception(struct work_struct *work);
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
2014-07-17 07:27:00 +09:00
{
if (type == KFD_QUEUE_TYPE_SDMA || type == KFD_QUEUE_TYPE_SDMA_XGMI)
return KFD_MQD_TYPE_SDMA;
return KFD_MQD_TYPE_CP;
2014-07-17 07:27:00 +09:00
}
static bool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe)
{
int i;
int pipe_offset = mec * dqm->dev->shared_resources.num_pipe_per_mec
+ pipe * dqm->dev->shared_resources.num_queue_per_pipe;
/* queue is available for KFD usage if bit is 1 */
for (i = 0; i < dqm->dev->shared_resources.num_queue_per_pipe; ++i)
if (test_bit(pipe_offset + i,
dqm->dev->shared_resources.queue_bitmap))
return true;
return false;
}
unsigned int get_queues_num(struct device_queue_manager *dqm)
{
return bitmap_weight(dqm->dev->shared_resources.queue_bitmap,
KGD_MAX_QUEUES);
}
unsigned int get_queues_per_pipe(struct device_queue_manager *dqm)
2014-07-17 07:27:00 +09:00
{
return dqm->dev->shared_resources.num_queue_per_pipe;
}
unsigned int get_pipes_per_mec(struct device_queue_manager *dqm)
{
return dqm->dev->shared_resources.num_pipe_per_mec;
2014-07-17 07:27:00 +09:00
}
static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
{
return dqm->dev->device_info->num_sdma_engines;
}
static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm)
{
return dqm->dev->device_info->num_xgmi_sdma_engines;
}
unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
{
return dqm->dev->device_info->num_sdma_engines
* dqm->dev->device_info->num_sdma_queues_per_engine;
}
unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm)
{
return dqm->dev->device_info->num_xgmi_sdma_engines
* dqm->dev->device_info->num_sdma_queues_per_engine;
}
void program_sh_mem_settings(struct device_queue_manager *dqm,
2014-07-17 07:27:00 +09:00
struct qcm_process_device *qpd)
{
return dqm->dev->kfd2kgd->program_sh_mem_settings(
dqm->dev->kgd, qpd->vmid,
2014-07-17 07:27:00 +09:00
qpd->sh_mem_config,
qpd->sh_mem_ape1_base,
qpd->sh_mem_ape1_limit,
qpd->sh_mem_bases);
}
static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
{
struct kfd_dev *dev = qpd->dqm->dev;
if (!KFD_IS_SOC15(dev->device_info->asic_family)) {
/* On pre-SOC15 chips we need to use the queue ID to
* preserve the user mode ABI.
*/
q->doorbell_id = q->properties.queue_id;
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
/* For SDMA queues on SOC15 with 8-byte doorbell, use static
* doorbell assignments based on the engine and queue id.
* The doobell index distance between RLC (2*i) and (2*i+1)
* for a SDMA engine is 512.
*/
uint32_t *idx_offset =
dev->shared_resources.sdma_doorbell_idx;
q->doorbell_id = idx_offset[q->properties.sdma_engine_id]
+ (q->properties.sdma_queue_id & 1)
* KFD_QUEUE_DOORBELL_MIRROR_OFFSET
+ (q->properties.sdma_queue_id >> 1);
} else {
/* For CP queues on SOC15 reserve a free doorbell ID */
unsigned int found;
found = find_first_zero_bit(qpd->doorbell_bitmap,
KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
pr_debug("No doorbells available");
return -EBUSY;
}
set_bit(found, qpd->doorbell_bitmap);
q->doorbell_id = found;
}
q->properties.doorbell_off =
kfd_doorbell_id_to_offset(dev, q->process,
q->doorbell_id);
return 0;
}
static void deallocate_doorbell(struct qcm_process_device *qpd,
struct queue *q)
{
unsigned int old;
struct kfd_dev *dev = qpd->dqm->dev;
if (!KFD_IS_SOC15(dev->device_info->asic_family) ||
q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
return;
old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap);
WARN_ON(!old);
}
2014-07-17 07:27:00 +09:00
static int allocate_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
{
int bit, allocated_vmid;
if (dqm->vmid_bitmap == 0)
return -ENOMEM;
bit = ffs(dqm->vmid_bitmap) - 1;
dqm->vmid_bitmap &= ~(1 << bit);
2014-07-17 07:27:00 +09:00
allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd;
pr_debug("vmid allocation %d\n", allocated_vmid);
2014-07-17 07:27:00 +09:00
qpd->vmid = allocated_vmid;
q->properties.vmid = allocated_vmid;
set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid);
program_sh_mem_settings(dqm, qpd);
/* qpd->page_table_base is set earlier when register_process()
* is called, i.e. when the first queue is created.
*/
dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd,
qpd->vmid,
qpd->page_table_base);
/* invalidate the VM context after pasid and vmid mapping is set up */
kfd_flush_tlb(qpd_to_pdd(qpd));
dqm->dev->kfd2kgd->set_scratch_backing_va(
dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
2014-07-17 07:27:00 +09:00
return 0;
}
static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
struct qcm_process_device *qpd)
{
const struct packet_manager_funcs *pmf = qpd->dqm->packets.pmf;
int ret;
if (!qpd->ib_kaddr)
return -ENOMEM;
ret = pmf->release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
if (ret)
return ret;
return amdgpu_amdkfd_submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
qpd->ib_base, (uint32_t *)qpd->ib_kaddr,
pmf->release_mem_size / sizeof(uint32_t));
}
2014-07-17 07:27:00 +09:00
static void deallocate_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
{
int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
2014-07-17 07:27:00 +09:00
/* On GFX v7, CP doesn't flush TC at dequeue */
if (q->device->device_info->asic_family == CHIP_HAWAII)
if (flush_texture_cache_nocpsch(q->device, qpd))
pr_err("Failed to flush TC\n");
kfd_flush_tlb(qpd_to_pdd(qpd));
/* Release the vmid mapping */
set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
dqm->vmid_bitmap |= (1 << bit);
2014-07-17 07:27:00 +09:00
qpd->vmid = 0;
q->properties.vmid = 0;
}
static int create_queue_nocpsch(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd)
2014-07-17 07:27:00 +09:00
{
struct mqd_manager *mqd_mgr;
2014-07-17 07:27:00 +09:00
int retval;
print_queue(q);
dqm_lock(dqm);
2014-07-17 07:27:00 +09:00
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new usermode queue because %d queues were already created\n",
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
dqm->total_queue_count);
retval = -EPERM;
goto out_unlock;
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
}
2014-07-17 07:27:00 +09:00
if (list_empty(&qpd->queues_list)) {
retval = allocate_vmid(dqm, qpd, q);
if (retval)
goto out_unlock;
2014-07-17 07:27:00 +09:00
}
q->properties.vmid = qpd->vmid;
/*
* Eviction state logic: mark all queues as evicted, even ones
* not currently active. Restoring inactive queues later only
* updates the is_evicted flag but is a no-op otherwise.
*/
q->properties.is_evicted = !!qpd->evicted;
2014-07-17 07:27:00 +09:00
q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
retval = allocate_hqd(dqm, q);
if (retval)
goto deallocate_vmid;
pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
q->pipe, q->queue);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
retval = allocate_sdma_queue(dqm, q);
if (retval)
goto deallocate_vmid;
dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
}
retval = allocate_doorbell(qpd, q);
if (retval)
goto out_deallocate_hqd;
drm/amdkfd: Fix a circular lock dependency The idea to break the circular lock dependency is to temporarily drop dqm lock before calling allocate_mqd. See callstack #1 below. [ 59.510149] [drm] Initialized amdgpu 3.30.0 20150101 for 0000:04:00.0 on minor 0 [ 513.604034] ====================================================== [ 513.604205] WARNING: possible circular locking dependency detected [ 513.604375] 4.18.0-kfd-root #2 Tainted: G W [ 513.604530] ------------------------------------------------------ [ 513.604699] kswapd0/611 is trying to acquire lock: [ 513.604840] 00000000d254022e (&dqm->lock_hidden){+.+.}, at: evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.605150] but task is already holding lock: [ 513.605307] 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250 [ 513.605540] which lock already depends on the new lock. [ 513.605747] the existing dependency chain (in reverse order) is: [ 513.605944] -> #4 (&anon_vma->rwsem){++++}: [ 513.606106] __vma_adjust+0x147/0x7f0 [ 513.606231] __split_vma+0x179/0x190 [ 513.606353] mprotect_fixup+0x217/0x260 [ 513.606553] do_mprotect_pkey+0x211/0x380 [ 513.606752] __x64_sys_mprotect+0x1b/0x20 [ 513.606954] do_syscall_64+0x50/0x1a0 [ 513.607149] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 513.607380] -> #3 (&mapping->i_mmap_rwsem){++++}: [ 513.607678] rmap_walk_file+0x1f0/0x280 [ 513.607887] page_referenced+0xdd/0x180 [ 513.608081] shrink_page_list+0x853/0xcb0 [ 513.608279] shrink_inactive_list+0x33b/0x700 [ 513.608483] shrink_node_memcg+0x37a/0x7f0 [ 513.608682] shrink_node+0xd8/0x490 [ 513.608869] balance_pgdat+0x18b/0x3b0 [ 513.609062] kswapd+0x203/0x5c0 [ 513.609241] kthread+0x100/0x140 [ 513.609420] ret_from_fork+0x24/0x30 [ 513.609607] -> #2 (fs_reclaim){+.+.}: [ 513.609883] kmem_cache_alloc_trace+0x34/0x2e0 [ 513.610093] reservation_object_reserve_shared+0x139/0x300 [ 513.610326] ttm_bo_init_reserved+0x291/0x480 [ttm] [ 513.610567] amdgpu_bo_do_create+0x1d2/0x650 [amdgpu] [ 513.610811] amdgpu_bo_create+0x40/0x1f0 [amdgpu] [ 513.611041] amdgpu_bo_create_reserved+0x249/0x2d0 [amdgpu] [ 513.611290] amdgpu_bo_create_kernel+0x12/0x70 [amdgpu] [ 513.611584] amdgpu_ttm_init+0x2cb/0x560 [amdgpu] [ 513.611823] gmc_v9_0_sw_init+0x400/0x750 [amdgpu] [ 513.612491] amdgpu_device_init+0x14eb/0x1990 [amdgpu] [ 513.612730] amdgpu_driver_load_kms+0x78/0x290 [amdgpu] [ 513.612958] drm_dev_register+0x111/0x1a0 [ 513.613171] amdgpu_pci_probe+0x11c/0x1e0 [amdgpu] [ 513.613389] local_pci_probe+0x3f/0x90 [ 513.613581] pci_device_probe+0x102/0x1c0 [ 513.613779] driver_probe_device+0x2a7/0x480 [ 513.613984] __driver_attach+0x10a/0x110 [ 513.614179] bus_for_each_dev+0x67/0xc0 [ 513.614372] bus_add_driver+0x1eb/0x260 [ 513.614565] driver_register+0x5b/0xe0 [ 513.614756] do_one_initcall+0xac/0x357 [ 513.614952] do_init_module+0x5b/0x213 [ 513.615145] load_module+0x2542/0x2d30 [ 513.615337] __do_sys_finit_module+0xd2/0x100 [ 513.615541] do_syscall_64+0x50/0x1a0 [ 513.615731] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 513.615963] -> #1 (reservation_ww_class_mutex){+.+.}: [ 513.616293] amdgpu_amdkfd_alloc_gtt_mem+0xcf/0x2c0 [amdgpu] [ 513.616554] init_mqd+0x223/0x260 [amdgpu] [ 513.616779] create_queue_nocpsch+0x4d9/0x600 [amdgpu] [ 513.617031] pqm_create_queue+0x37c/0x520 [amdgpu] [ 513.617270] kfd_ioctl_create_queue+0x2f9/0x650 [amdgpu] [ 513.617522] kfd_ioctl+0x202/0x350 [amdgpu] [ 513.617724] do_vfs_ioctl+0x9f/0x6c0 [ 513.617914] ksys_ioctl+0x66/0x70 [ 513.618095] __x64_sys_ioctl+0x16/0x20 [ 513.618286] do_syscall_64+0x50/0x1a0 [ 513.618476] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 513.618695] -> #0 (&dqm->lock_hidden){+.+.}: [ 513.618984] __mutex_lock+0x98/0x970 [ 513.619197] evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.619459] kfd_process_evict_queues+0x3b/0xb0 [amdgpu] [ 513.619710] kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu] [ 513.620103] amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu] [ 513.620363] amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu] [ 513.620614] __mmu_notifier_invalidate_range_start+0x70/0xb0 [ 513.620851] try_to_unmap_one+0x7fc/0x8f0 [ 513.621049] rmap_walk_anon+0x121/0x290 [ 513.621242] try_to_unmap+0x93/0xf0 [ 513.621428] shrink_page_list+0x606/0xcb0 [ 513.621625] shrink_inactive_list+0x33b/0x700 [ 513.621835] shrink_node_memcg+0x37a/0x7f0 [ 513.622034] shrink_node+0xd8/0x490 [ 513.622219] balance_pgdat+0x18b/0x3b0 [ 513.622410] kswapd+0x203/0x5c0 [ 513.622589] kthread+0x100/0x140 [ 513.622769] ret_from_fork+0x24/0x30 [ 513.622957] other info that might help us debug this: [ 513.623354] Chain exists of: &dqm->lock_hidden --> &mapping->i_mmap_rwsem --> &anon_vma->rwsem [ 513.623900] Possible unsafe locking scenario: [ 513.624189] CPU0 CPU1 [ 513.624397] ---- ---- [ 513.624594] lock(&anon_vma->rwsem); [ 513.624771] lock(&mapping->i_mmap_rwsem); [ 513.625020] lock(&anon_vma->rwsem); [ 513.625253] lock(&dqm->lock_hidden); [ 513.625433] *** DEADLOCK *** [ 513.625783] 3 locks held by kswapd0/611: [ 513.625967] #0: 00000000f14edf84 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30 [ 513.626309] #1: 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250 [ 513.626671] #2: 0000000067b5cd12 (srcu){....}, at: __mmu_notifier_invalidate_range_start+0x5/0xb0 [ 513.627037] stack backtrace: [ 513.627292] CPU: 0 PID: 611 Comm: kswapd0 Tainted: G W 4.18.0-kfd-root #2 [ 513.627632] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 513.627990] Call Trace: [ 513.628143] dump_stack+0x7c/0xbb [ 513.628315] print_circular_bug.isra.37+0x21b/0x228 [ 513.628581] __lock_acquire+0xf7d/0x1470 [ 513.628782] ? unwind_next_frame+0x6c/0x4f0 [ 513.628974] ? lock_acquire+0xec/0x1e0 [ 513.629154] lock_acquire+0xec/0x1e0 [ 513.629357] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.629587] __mutex_lock+0x98/0x970 [ 513.629790] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.630047] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.630309] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.630562] evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.630816] kfd_process_evict_queues+0x3b/0xb0 [amdgpu] [ 513.631057] kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu] [ 513.631288] amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu] [ 513.631536] amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu] [ 513.632076] __mmu_notifier_invalidate_range_start+0x70/0xb0 [ 513.632299] try_to_unmap_one+0x7fc/0x8f0 [ 513.632487] ? page_lock_anon_vma_read+0x68/0x250 [ 513.632690] rmap_walk_anon+0x121/0x290 [ 513.632875] try_to_unmap+0x93/0xf0 [ 513.633050] ? page_remove_rmap+0x330/0x330 [ 513.633239] ? rcu_read_unlock+0x60/0x60 [ 513.633422] ? page_get_anon_vma+0x160/0x160 [ 513.633613] shrink_page_list+0x606/0xcb0 [ 513.633800] shrink_inactive_list+0x33b/0x700 [ 513.633997] shrink_node_memcg+0x37a/0x7f0 [ 513.634186] ? shrink_node+0xd8/0x490 [ 513.634363] shrink_node+0xd8/0x490 [ 513.634537] balance_pgdat+0x18b/0x3b0 [ 513.634718] kswapd+0x203/0x5c0 [ 513.634887] ? wait_woken+0xb0/0xb0 [ 513.635062] kthread+0x100/0x140 [ 513.635231] ? balance_pgdat+0x3b0/0x3b0 [ 513.635414] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 513.635626] ret_from_fork+0x24/0x30 [ 513.636042] Evicting PASID 32768 queues [ 513.936236] Restoring PASID 32768 queues [ 524.708912] Evicting PASID 32768 queues [ 524.999875] Restoring PASID 32768 queues Signed-off-by: Oak Zeng <Oak.Zeng@amd.com> Reviewed-by: Philip Yang <philip.yang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2019-06-15 10:10:45 +09:00
/* Temporarily release dqm lock to avoid a circular lock dependency */
dqm_unlock(dqm);
q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
drm/amdkfd: Fix a circular lock dependency The idea to break the circular lock dependency is to temporarily drop dqm lock before calling allocate_mqd. See callstack #1 below. [ 59.510149] [drm] Initialized amdgpu 3.30.0 20150101 for 0000:04:00.0 on minor 0 [ 513.604034] ====================================================== [ 513.604205] WARNING: possible circular locking dependency detected [ 513.604375] 4.18.0-kfd-root #2 Tainted: G W [ 513.604530] ------------------------------------------------------ [ 513.604699] kswapd0/611 is trying to acquire lock: [ 513.604840] 00000000d254022e (&dqm->lock_hidden){+.+.}, at: evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.605150] but task is already holding lock: [ 513.605307] 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250 [ 513.605540] which lock already depends on the new lock. [ 513.605747] the existing dependency chain (in reverse order) is: [ 513.605944] -> #4 (&anon_vma->rwsem){++++}: [ 513.606106] __vma_adjust+0x147/0x7f0 [ 513.606231] __split_vma+0x179/0x190 [ 513.606353] mprotect_fixup+0x217/0x260 [ 513.606553] do_mprotect_pkey+0x211/0x380 [ 513.606752] __x64_sys_mprotect+0x1b/0x20 [ 513.606954] do_syscall_64+0x50/0x1a0 [ 513.607149] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 513.607380] -> #3 (&mapping->i_mmap_rwsem){++++}: [ 513.607678] rmap_walk_file+0x1f0/0x280 [ 513.607887] page_referenced+0xdd/0x180 [ 513.608081] shrink_page_list+0x853/0xcb0 [ 513.608279] shrink_inactive_list+0x33b/0x700 [ 513.608483] shrink_node_memcg+0x37a/0x7f0 [ 513.608682] shrink_node+0xd8/0x490 [ 513.608869] balance_pgdat+0x18b/0x3b0 [ 513.609062] kswapd+0x203/0x5c0 [ 513.609241] kthread+0x100/0x140 [ 513.609420] ret_from_fork+0x24/0x30 [ 513.609607] -> #2 (fs_reclaim){+.+.}: [ 513.609883] kmem_cache_alloc_trace+0x34/0x2e0 [ 513.610093] reservation_object_reserve_shared+0x139/0x300 [ 513.610326] ttm_bo_init_reserved+0x291/0x480 [ttm] [ 513.610567] amdgpu_bo_do_create+0x1d2/0x650 [amdgpu] [ 513.610811] amdgpu_bo_create+0x40/0x1f0 [amdgpu] [ 513.611041] amdgpu_bo_create_reserved+0x249/0x2d0 [amdgpu] [ 513.611290] amdgpu_bo_create_kernel+0x12/0x70 [amdgpu] [ 513.611584] amdgpu_ttm_init+0x2cb/0x560 [amdgpu] [ 513.611823] gmc_v9_0_sw_init+0x400/0x750 [amdgpu] [ 513.612491] amdgpu_device_init+0x14eb/0x1990 [amdgpu] [ 513.612730] amdgpu_driver_load_kms+0x78/0x290 [amdgpu] [ 513.612958] drm_dev_register+0x111/0x1a0 [ 513.613171] amdgpu_pci_probe+0x11c/0x1e0 [amdgpu] [ 513.613389] local_pci_probe+0x3f/0x90 [ 513.613581] pci_device_probe+0x102/0x1c0 [ 513.613779] driver_probe_device+0x2a7/0x480 [ 513.613984] __driver_attach+0x10a/0x110 [ 513.614179] bus_for_each_dev+0x67/0xc0 [ 513.614372] bus_add_driver+0x1eb/0x260 [ 513.614565] driver_register+0x5b/0xe0 [ 513.614756] do_one_initcall+0xac/0x357 [ 513.614952] do_init_module+0x5b/0x213 [ 513.615145] load_module+0x2542/0x2d30 [ 513.615337] __do_sys_finit_module+0xd2/0x100 [ 513.615541] do_syscall_64+0x50/0x1a0 [ 513.615731] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 513.615963] -> #1 (reservation_ww_class_mutex){+.+.}: [ 513.616293] amdgpu_amdkfd_alloc_gtt_mem+0xcf/0x2c0 [amdgpu] [ 513.616554] init_mqd+0x223/0x260 [amdgpu] [ 513.616779] create_queue_nocpsch+0x4d9/0x600 [amdgpu] [ 513.617031] pqm_create_queue+0x37c/0x520 [amdgpu] [ 513.617270] kfd_ioctl_create_queue+0x2f9/0x650 [amdgpu] [ 513.617522] kfd_ioctl+0x202/0x350 [amdgpu] [ 513.617724] do_vfs_ioctl+0x9f/0x6c0 [ 513.617914] ksys_ioctl+0x66/0x70 [ 513.618095] __x64_sys_ioctl+0x16/0x20 [ 513.618286] do_syscall_64+0x50/0x1a0 [ 513.618476] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 513.618695] -> #0 (&dqm->lock_hidden){+.+.}: [ 513.618984] __mutex_lock+0x98/0x970 [ 513.619197] evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.619459] kfd_process_evict_queues+0x3b/0xb0 [amdgpu] [ 513.619710] kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu] [ 513.620103] amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu] [ 513.620363] amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu] [ 513.620614] __mmu_notifier_invalidate_range_start+0x70/0xb0 [ 513.620851] try_to_unmap_one+0x7fc/0x8f0 [ 513.621049] rmap_walk_anon+0x121/0x290 [ 513.621242] try_to_unmap+0x93/0xf0 [ 513.621428] shrink_page_list+0x606/0xcb0 [ 513.621625] shrink_inactive_list+0x33b/0x700 [ 513.621835] shrink_node_memcg+0x37a/0x7f0 [ 513.622034] shrink_node+0xd8/0x490 [ 513.622219] balance_pgdat+0x18b/0x3b0 [ 513.622410] kswapd+0x203/0x5c0 [ 513.622589] kthread+0x100/0x140 [ 513.622769] ret_from_fork+0x24/0x30 [ 513.622957] other info that might help us debug this: [ 513.623354] Chain exists of: &dqm->lock_hidden --> &mapping->i_mmap_rwsem --> &anon_vma->rwsem [ 513.623900] Possible unsafe locking scenario: [ 513.624189] CPU0 CPU1 [ 513.624397] ---- ---- [ 513.624594] lock(&anon_vma->rwsem); [ 513.624771] lock(&mapping->i_mmap_rwsem); [ 513.625020] lock(&anon_vma->rwsem); [ 513.625253] lock(&dqm->lock_hidden); [ 513.625433] *** DEADLOCK *** [ 513.625783] 3 locks held by kswapd0/611: [ 513.625967] #0: 00000000f14edf84 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30 [ 513.626309] #1: 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250 [ 513.626671] #2: 0000000067b5cd12 (srcu){....}, at: __mmu_notifier_invalidate_range_start+0x5/0xb0 [ 513.627037] stack backtrace: [ 513.627292] CPU: 0 PID: 611 Comm: kswapd0 Tainted: G W 4.18.0-kfd-root #2 [ 513.627632] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 513.627990] Call Trace: [ 513.628143] dump_stack+0x7c/0xbb [ 513.628315] print_circular_bug.isra.37+0x21b/0x228 [ 513.628581] __lock_acquire+0xf7d/0x1470 [ 513.628782] ? unwind_next_frame+0x6c/0x4f0 [ 513.628974] ? lock_acquire+0xec/0x1e0 [ 513.629154] lock_acquire+0xec/0x1e0 [ 513.629357] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.629587] __mutex_lock+0x98/0x970 [ 513.629790] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.630047] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.630309] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.630562] evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [ 513.630816] kfd_process_evict_queues+0x3b/0xb0 [amdgpu] [ 513.631057] kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu] [ 513.631288] amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu] [ 513.631536] amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu] [ 513.632076] __mmu_notifier_invalidate_range_start+0x70/0xb0 [ 513.632299] try_to_unmap_one+0x7fc/0x8f0 [ 513.632487] ? page_lock_anon_vma_read+0x68/0x250 [ 513.632690] rmap_walk_anon+0x121/0x290 [ 513.632875] try_to_unmap+0x93/0xf0 [ 513.633050] ? page_remove_rmap+0x330/0x330 [ 513.633239] ? rcu_read_unlock+0x60/0x60 [ 513.633422] ? page_get_anon_vma+0x160/0x160 [ 513.633613] shrink_page_list+0x606/0xcb0 [ 513.633800] shrink_inactive_list+0x33b/0x700 [ 513.633997] shrink_node_memcg+0x37a/0x7f0 [ 513.634186] ? shrink_node+0xd8/0x490 [ 513.634363] shrink_node+0xd8/0x490 [ 513.634537] balance_pgdat+0x18b/0x3b0 [ 513.634718] kswapd+0x203/0x5c0 [ 513.634887] ? wait_woken+0xb0/0xb0 [ 513.635062] kthread+0x100/0x140 [ 513.635231] ? balance_pgdat+0x3b0/0x3b0 [ 513.635414] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 513.635626] ret_from_fork+0x24/0x30 [ 513.636042] Evicting PASID 32768 queues [ 513.936236] Restoring PASID 32768 queues [ 524.708912] Evicting PASID 32768 queues [ 524.999875] Restoring PASID 32768 queues Signed-off-by: Oak Zeng <Oak.Zeng@amd.com> Reviewed-by: Philip Yang <philip.yang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2019-06-15 10:10:45 +09:00
dqm_lock(dqm);
if (!q->mqd_mem_obj) {
retval = -ENOMEM;
goto out_deallocate_doorbell;
}
mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
if (q->properties.is_active) {
if (WARN(q->process->mm != current->mm,
"should only run in user thread"))
retval = -EFAULT;
else
retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
q->queue, &q->properties, current->mm);
if (retval)
goto out_free_mqd;
2014-07-17 07:27:00 +09:00
}
list_add(&q->list, &qpd->queues_list);
qpd->queue_count++;
if (q->properties.is_active)
dqm->queue_count++;
2014-07-17 07:27:00 +09:00
if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
dqm->sdma_queue_count++;
else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
dqm->xgmi_sdma_queue_count++;
2014-07-17 07:27:00 +09:00
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
/*
* Unconditionally increment this counter, regardless of the queue's
* type or whether the queue is active.
*/
dqm->total_queue_count++;
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
goto out_unlock;
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
out_free_mqd:
mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
out_deallocate_doorbell:
deallocate_doorbell(qpd, q);
out_deallocate_hqd:
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
deallocate_hqd(dqm, q);
else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
deallocate_sdma_queue(dqm, q);
deallocate_vmid:
if (list_empty(&qpd->queues_list))
deallocate_vmid(dqm, qpd, q);
out_unlock:
dqm_unlock(dqm);
return retval;
2014-07-17 07:27:00 +09:00
}
static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q)
{
bool set;
int pipe, bit, i;
2014-07-17 07:27:00 +09:00
set = false;
for (pipe = dqm->next_pipe_to_allocate, i = 0;
i < get_pipes_per_mec(dqm);
pipe = ((pipe + 1) % get_pipes_per_mec(dqm)), ++i) {
if (!is_pipe_enabled(dqm, 0, pipe))
continue;
2014-07-17 07:27:00 +09:00
if (dqm->allocated_queues[pipe] != 0) {
bit = ffs(dqm->allocated_queues[pipe]) - 1;
dqm->allocated_queues[pipe] &= ~(1 << bit);
2014-07-17 07:27:00 +09:00
q->pipe = pipe;
q->queue = bit;
set = true;
break;
}
}
if (!set)
2014-07-17 07:27:00 +09:00
return -EBUSY;
pr_debug("hqd slot - pipe %d, queue %d\n", q->pipe, q->queue);
2014-07-17 07:27:00 +09:00
/* horizontal hqd allocation */
dqm->next_pipe_to_allocate = (pipe + 1) % get_pipes_per_mec(dqm);
2014-07-17 07:27:00 +09:00
return 0;
}
static inline void deallocate_hqd(struct device_queue_manager *dqm,
struct queue *q)
{
dqm->allocated_queues[q->pipe] |= (1 << q->queue);
2014-07-17 07:27:00 +09:00
}
/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
* to avoid asynchronized access
*/
static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
2014-07-17 07:27:00 +09:00
struct qcm_process_device *qpd,
struct queue *q)
{
int retval;
struct mqd_manager *mqd_mgr;
2014-07-17 07:27:00 +09:00
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
2014-07-17 07:27:00 +09:00
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
deallocate_hqd(dqm, q);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
deallocate_sdma_queue(dqm, q);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
dqm->xgmi_sdma_queue_count--;
deallocate_sdma_queue(dqm, q);
} else {
pr_debug("q->properties.type %d is invalid\n",
q->properties.type);
return -EINVAL;
2014-07-17 07:27:00 +09:00
}
dqm->total_queue_count--;
2014-07-17 07:27:00 +09:00
deallocate_doorbell(qpd, q);
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
KFD_UNMAP_LATENCY_MS,
2014-07-17 07:27:00 +09:00
q->pipe, q->queue);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
2014-07-17 07:27:00 +09:00
mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
2014-07-17 07:27:00 +09:00
list_del(&q->list);
if (list_empty(&qpd->queues_list)) {
if (qpd->reset_wavefronts) {
pr_warn("Resetting wave fronts (nocpsch) on dev %p\n",
dqm->dev);
/* dbgdev_wave_reset_wavefronts has to be called before
* deallocate_vmid(), i.e. when vmid is still in use.
*/
dbgdev_wave_reset_wavefronts(dqm->dev,
qpd->pqm->process);
qpd->reset_wavefronts = false;
}
2014-07-17 07:27:00 +09:00
deallocate_vmid(dqm, qpd, q);
}
qpd->queue_count--;
if (q->properties.is_active)
dqm->queue_count--;
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
return retval;
}
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
{
int retval;
dqm_lock(dqm);
retval = destroy_queue_nocpsch_locked(dqm, qpd, q);
dqm_unlock(dqm);
2014-07-17 07:27:00 +09:00
return retval;
}
static int update_queue(struct device_queue_manager *dqm, struct queue *q)
{
int retval = 0;
struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
bool prev_active = false;
2014-07-17 07:27:00 +09:00
dqm_lock(dqm);
pdd = kfd_get_process_device_data(q->device, q->process);
if (!pdd) {
retval = -ENODEV;
goto out_unlock;
}
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
2014-07-17 07:27:00 +09:00
/* Save previous activity state for counters */
prev_active = q->properties.is_active;
/* Make sure the queue is unmapped before updating the MQD */
if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
retval = unmap_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (retval) {
pr_err("unmap queue failed\n");
goto out_unlock;
}
} else if (prev_active &&
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
if (retval) {
pr_err("destroy mqd failed\n");
goto out_unlock;
}
}
mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties);
/*
* check active state vs. the previous state and modify
* counter accordingly. map_queues_cpsch uses the
* dqm->queue_count to determine whether a new runlist must be
* uploaded.
*/
if (q->properties.is_active && !prev_active)
dqm->queue_count++;
else if (!q->properties.is_active && prev_active)
dqm->queue_count--;
if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
retval = map_queues_cpsch(dqm);
else if (q->properties.is_active &&
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
if (WARN(q->process->mm != current->mm,
"should only run in user thread"))
retval = -EFAULT;
else
retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd,
q->pipe, q->queue,
&q->properties, current->mm);
}
out_unlock:
dqm_unlock(dqm);
2014-07-17 07:27:00 +09:00
return retval;
}
static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
int retval, ret = 0;
dqm_lock(dqm);
if (qpd->evicted++ > 0) /* already evicted, do nothing */
goto out;
pdd = qpd_to_pdd(qpd);
pr_info_ratelimited("Evicting PASID %u queues\n",
pdd->process->pasid);
/* Mark all queues as evicted. Deactivate all active queues on
* the qpd.
*/
list_for_each_entry(q, &qpd->queues_list, list) {
q->properties.is_evicted = true;
if (!q->properties.is_active)
continue;
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
q->properties.is_active = false;
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
if (retval && !ret)
/* Return the first error, but keep going to
* maintain a consistent eviction state
*/
ret = retval;
dqm->queue_count--;
}
out:
dqm_unlock(dqm);
return ret;
}
static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
struct kfd_process_device *pdd;
int retval = 0;
dqm_lock(dqm);
if (qpd->evicted++ > 0) /* already evicted, do nothing */
goto out;
pdd = qpd_to_pdd(qpd);
pr_info_ratelimited("Evicting PASID %u queues\n",
pdd->process->pasid);
/* Mark all queues as evicted. Deactivate all active queues on
* the qpd.
*/
list_for_each_entry(q, &qpd->queues_list, list) {
q->properties.is_evicted = true;
if (!q->properties.is_active)
continue;
q->properties.is_active = false;
dqm->queue_count--;
}
retval = execute_queues_cpsch(dqm,
qpd->is_debug ?
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
out:
dqm_unlock(dqm);
return retval;
}
static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct mm_struct *mm = NULL;
struct queue *q;
struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
uint64_t pd_base;
int retval, ret = 0;
pdd = qpd_to_pdd(qpd);
/* Retrieve PD base */
pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->vm);
dqm_lock(dqm);
if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
goto out;
if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
qpd->evicted--;
goto out;
}
pr_info_ratelimited("Restoring PASID %u queues\n",
pdd->process->pasid);
/* Update PD Base in QPD */
qpd->page_table_base = pd_base;
pr_debug("Updated PD address to 0x%llx\n", pd_base);
if (!list_empty(&qpd->queues_list)) {
dqm->dev->kfd2kgd->set_vm_context_page_table_base(
dqm->dev->kgd,
qpd->vmid,
qpd->page_table_base);
kfd_flush_tlb(pdd);
}
/* Take a safe reference to the mm_struct, which may otherwise
* disappear even while the kfd_process is still referenced.
*/
mm = get_task_mm(pdd->process->lead_thread);
if (!mm) {
ret = -EFAULT;
goto out;
}
/* Remove the eviction flags. Activate queues that are not
* inactive for other reasons.
*/
list_for_each_entry(q, &qpd->queues_list, list) {
q->properties.is_evicted = false;
if (!QUEUE_IS_ACTIVE(q->properties))
continue;
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
q->properties.is_active = true;
retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
q->queue, &q->properties, mm);
if (retval && !ret)
/* Return the first error, but keep going to
* maintain a consistent eviction state
*/
ret = retval;
dqm->queue_count++;
}
qpd->evicted = 0;
out:
if (mm)
mmput(mm);
dqm_unlock(dqm);
return ret;
}
static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
struct kfd_process_device *pdd;
uint64_t pd_base;
int retval = 0;
pdd = qpd_to_pdd(qpd);
/* Retrieve PD base */
pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->vm);
dqm_lock(dqm);
if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
goto out;
if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
qpd->evicted--;
goto out;
}
pr_info_ratelimited("Restoring PASID %u queues\n",
pdd->process->pasid);
/* Update PD Base in QPD */
qpd->page_table_base = pd_base;
pr_debug("Updated PD address to 0x%llx\n", pd_base);
/* activate all active queues on the qpd */
list_for_each_entry(q, &qpd->queues_list, list) {
q->properties.is_evicted = false;
if (!QUEUE_IS_ACTIVE(q->properties))
continue;
q->properties.is_active = true;
dqm->queue_count++;
}
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
qpd->evicted = 0;
out:
dqm_unlock(dqm);
return retval;
}
static int register_process(struct device_queue_manager *dqm,
2014-07-17 07:27:00 +09:00
struct qcm_process_device *qpd)
{
struct device_process_node *n;
struct kfd_process_device *pdd;
uint64_t pd_base;
int retval;
2014-07-17 07:27:00 +09:00
n = kzalloc(sizeof(*n), GFP_KERNEL);
2014-07-17 07:27:00 +09:00
if (!n)
return -ENOMEM;
n->qpd = qpd;
pdd = qpd_to_pdd(qpd);
/* Retrieve PD base */
pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->vm);
dqm_lock(dqm);
2014-07-17 07:27:00 +09:00
list_add(&n->list, &dqm->queues);
/* Update PD Base in QPD */
qpd->page_table_base = pd_base;
pr_debug("Updated PD address to 0x%llx\n", pd_base);
retval = dqm->asic_ops.update_qpd(dqm, qpd);
dqm->processes_count++;
2014-07-17 07:27:00 +09:00
dqm_unlock(dqm);
2014-07-17 07:27:00 +09:00
/* Outside the DQM lock because under the DQM lock we can't do
* reclaim or take other locks that others hold while reclaiming.
*/
kfd_inc_compute_active(dqm->dev);
return retval;
2014-07-17 07:27:00 +09:00
}
static int unregister_process(struct device_queue_manager *dqm,
2014-07-17 07:27:00 +09:00
struct qcm_process_device *qpd)
{
int retval;
struct device_process_node *cur, *next;
pr_debug("qpd->queues_list is %s\n",
list_empty(&qpd->queues_list) ? "empty" : "not empty");
2014-07-17 07:27:00 +09:00
retval = 0;
dqm_lock(dqm);
2014-07-17 07:27:00 +09:00
list_for_each_entry_safe(cur, next, &dqm->queues, list) {
if (qpd == cur->qpd) {
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
2014-07-17 07:27:00 +09:00
goto out;
}
}
/* qpd not found in dqm list */
retval = 1;
out:
dqm_unlock(dqm);
/* Outside the DQM lock because under the DQM lock we can't do
* reclaim or take other locks that others hold while reclaiming.
*/
if (!retval)
kfd_dec_compute_active(dqm->dev);
2014-07-17 07:27:00 +09:00
return retval;
}
static int
set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid,
unsigned int vmid)
{
return dqm->dev->kfd2kgd->set_pasid_vmid_mapping(
dqm->dev->kgd, pasid, vmid);
2014-07-17 07:27:00 +09:00
}
static void init_interrupts(struct device_queue_manager *dqm)
{
unsigned int i;
for (i = 0 ; i < get_pipes_per_mec(dqm) ; i++)
if (is_pipe_enabled(dqm, 0, i))
dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i);
}
2014-07-17 07:27:00 +09:00
static int initialize_nocpsch(struct device_queue_manager *dqm)
{
int pipe, queue;
2014-07-17 07:27:00 +09:00
pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm));
2014-07-17 07:27:00 +09:00
dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm),
sizeof(unsigned int), GFP_KERNEL);
if (!dqm->allocated_queues)
return -ENOMEM;
mutex_init(&dqm->lock_hidden);
2014-07-17 07:27:00 +09:00
INIT_LIST_HEAD(&dqm->queues);
dqm->queue_count = dqm->next_pipe_to_allocate = 0;
dqm->sdma_queue_count = 0;
dqm->xgmi_sdma_queue_count = 0;
2014-07-17 07:27:00 +09:00
for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
int pipe_offset = pipe * get_queues_per_pipe(dqm);
for (queue = 0; queue < get_queues_per_pipe(dqm); queue++)
if (test_bit(pipe_offset + queue,
dqm->dev->shared_resources.queue_bitmap))
dqm->allocated_queues[pipe] |= 1 << queue;
}
2014-07-17 07:27:00 +09:00
dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm));
dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm));
2014-07-17 07:27:00 +09:00
return 0;
}
static void uninitialize(struct device_queue_manager *dqm)
2014-07-17 07:27:00 +09:00
{
int i;
WARN_ON(dqm->queue_count > 0 || dqm->processes_count > 0);
2014-07-17 07:27:00 +09:00
kfree(dqm->allocated_queues);
for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
kfree(dqm->mqd_mgrs[i]);
mutex_destroy(&dqm->lock_hidden);
kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem);
2014-07-17 07:27:00 +09:00
}
static int start_nocpsch(struct device_queue_manager *dqm)
{
init_interrupts(dqm);
return pm_init(&dqm->packets, dqm);
2014-07-17 07:27:00 +09:00
}
static int stop_nocpsch(struct device_queue_manager *dqm)
{
pm_uninit(&dqm->packets);
2014-07-17 07:27:00 +09:00
return 0;
}
static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q)
{
int bit;
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
if (dqm->sdma_bitmap == 0)
return -ENOMEM;
bit = __ffs64(dqm->sdma_bitmap);
dqm->sdma_bitmap &= ~(1ULL << bit);
q->sdma_id = bit;
q->properties.sdma_engine_id = q->sdma_id %
get_num_sdma_engines(dqm);
q->properties.sdma_queue_id = q->sdma_id /
get_num_sdma_engines(dqm);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
if (dqm->xgmi_sdma_bitmap == 0)
return -ENOMEM;
bit = __ffs64(dqm->xgmi_sdma_bitmap);
dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
q->sdma_id = bit;
/* sdma_engine_id is sdma id including
* both PCIe-optimized SDMAs and XGMI-
* optimized SDMAs. The calculation below
* assumes the first N engines are always
* PCIe-optimized ones
*/
q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
q->sdma_id % get_num_xgmi_sdma_engines(dqm);
q->properties.sdma_queue_id = q->sdma_id /
get_num_xgmi_sdma_engines(dqm);
}
pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
return 0;
}
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q)
{
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
if (q->sdma_id >= get_num_sdma_queues(dqm))
return;
dqm->sdma_bitmap |= (1ULL << q->sdma_id);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
return;
dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
}
}
2014-07-17 07:27:00 +09:00
/*
* Device Queue Manager implementation for cp scheduler
*/
static int set_sched_resources(struct device_queue_manager *dqm)
{
int i, mec;
2014-07-17 07:27:00 +09:00
struct scheduling_resources res;
res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
res.queue_mask = 0;
for (i = 0; i < KGD_MAX_QUEUES; ++i) {
mec = (i / dqm->dev->shared_resources.num_queue_per_pipe)
/ dqm->dev->shared_resources.num_pipe_per_mec;
if (!test_bit(i, dqm->dev->shared_resources.queue_bitmap))
continue;
/* only acquire queues from the first MEC */
if (mec > 0)
continue;
/* This situation may be hit in the future if a new HW
* generation exposes more than 64 queues. If so, the
* definition of res.queue_mask needs updating
*/
if (WARN_ON(i >= (sizeof(res.queue_mask)*8))) {
pr_err("Invalid queue enabled by amdgpu: %d\n", i);
break;
}
res.queue_mask |= (1ull << i);
}
res.gws_mask = ~0ull;
res.oac_mask = res.gds_heap_base = res.gds_heap_size = 0;
2014-07-17 07:27:00 +09:00
pr_debug("Scheduling resources:\n"
"vmid mask: 0x%8X\n"
"queue mask: 0x%8llX\n",
2014-07-17 07:27:00 +09:00
res.vmid_mask, res.queue_mask);
return pm_send_set_resources(&dqm->packets, &res);
}
static int initialize_cpsch(struct device_queue_manager *dqm)
{
uint64_t num_sdma_queues;
uint64_t num_xgmi_sdma_queues;
pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm));
2014-07-17 07:27:00 +09:00
mutex_init(&dqm->lock_hidden);
2014-07-17 07:27:00 +09:00
INIT_LIST_HEAD(&dqm->queues);
dqm->queue_count = dqm->processes_count = 0;
dqm->sdma_queue_count = 0;
dqm->xgmi_sdma_queue_count = 0;
2014-07-17 07:27:00 +09:00
dqm->active_runlist = false;
num_sdma_queues = get_num_sdma_queues(dqm);
if (num_sdma_queues >= BITS_PER_TYPE(dqm->sdma_bitmap))
dqm->sdma_bitmap = ULLONG_MAX;
else
dqm->sdma_bitmap = (BIT_ULL(num_sdma_queues) - 1);
num_xgmi_sdma_queues = get_num_xgmi_sdma_queues(dqm);
if (num_xgmi_sdma_queues >= BITS_PER_TYPE(dqm->xgmi_sdma_bitmap))
dqm->xgmi_sdma_bitmap = ULLONG_MAX;
else
dqm->xgmi_sdma_bitmap = (BIT_ULL(num_xgmi_sdma_queues) - 1);
2014-07-17 07:27:00 +09:00
INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
return 0;
2014-07-17 07:27:00 +09:00
}
static int start_cpsch(struct device_queue_manager *dqm)
{
int retval;
retval = 0;
retval = pm_init(&dqm->packets, dqm);
if (retval)
2014-07-17 07:27:00 +09:00
goto fail_packet_manager_init;
retval = set_sched_resources(dqm);
if (retval)
2014-07-17 07:27:00 +09:00
goto fail_set_sched_resources;
pr_debug("Allocating fence memory\n");
2014-07-17 07:27:00 +09:00
/* allocate fence memory on the gart */
retval = kfd_gtt_sa_allocate(dqm->dev, sizeof(*dqm->fence_addr),
&dqm->fence_mem);
2014-07-17 07:27:00 +09:00
if (retval)
2014-07-17 07:27:00 +09:00
goto fail_allocate_vidmem;
dqm->fence_addr = dqm->fence_mem->cpu_ptr;
dqm->fence_gpu_addr = dqm->fence_mem->gpu_addr;
init_interrupts(dqm);
dqm_lock(dqm);
/* clear hang status when driver try to start the hw scheduler */
dqm->is_hws_hang = false;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
dqm_unlock(dqm);
2014-07-17 07:27:00 +09:00
return 0;
fail_allocate_vidmem:
fail_set_sched_resources:
pm_uninit(&dqm->packets);
fail_packet_manager_init:
return retval;
}
static int stop_cpsch(struct device_queue_manager *dqm)
{
dqm_lock(dqm);
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
dqm_unlock(dqm);
2014-07-17 07:27:00 +09:00
pm_release_ib(&dqm->packets);
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
2014-07-17 07:27:00 +09:00
pm_uninit(&dqm->packets);
return 0;
}
static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
struct kernel_queue *kq,
struct qcm_process_device *qpd)
{
dqm_lock(dqm);
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new kernel queue because %d queues were already created\n",
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
dqm->total_queue_count);
dqm_unlock(dqm);
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
return -EPERM;
}
/*
* Unconditionally increment this counter, regardless of the queue's
* type or whether the queue is active.
*/
dqm->total_queue_count++;
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
2014-07-17 07:27:00 +09:00
list_add(&kq->list, &qpd->priv_queue_list);
dqm->queue_count++;
qpd->is_debug = true;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
dqm_unlock(dqm);
2014-07-17 07:27:00 +09:00
return 0;
}
static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
struct kernel_queue *kq,
struct qcm_process_device *qpd)
{
dqm_lock(dqm);
2014-07-17 07:27:00 +09:00
list_del(&kq->list);
dqm->queue_count--;
qpd->is_debug = false;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
/*
* Unconditionally decrement this counter, regardless of the queue's
* type.
*/
dqm->total_queue_count--;
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
dqm_unlock(dqm);
2014-07-17 07:27:00 +09:00
}
static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd)
2014-07-17 07:27:00 +09:00
{
int retval;
struct mqd_manager *mqd_mgr;
2014-07-17 07:27:00 +09:00
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new usermode queue because %d queues were already created\n",
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
dqm->total_queue_count);
retval = -EPERM;
goto out;
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
}
if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
dqm_lock(dqm);
retval = allocate_sdma_queue(dqm, q);
dqm_unlock(dqm);
if (retval)
goto out;
}
retval = allocate_doorbell(qpd, q);
if (retval)
goto out_deallocate_sdma_queue;
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
if (!q->mqd_mem_obj) {
retval = -ENOMEM;
goto out_deallocate_doorbell;
}
dqm_lock(dqm);
/*
* Eviction state logic: mark all queues as evicted, even ones
* not currently active. Restoring inactive queues later only
* updates the is_evicted flag but is a no-op otherwise.
*/
q->properties.is_evicted = !!qpd->evicted;
mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
2014-07-17 07:27:00 +09:00
list_add(&q->list, &qpd->queues_list);
qpd->queue_count++;
if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
dqm->sdma_queue_count++;
else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
dqm->xgmi_sdma_queue_count++;
2014-07-17 07:27:00 +09:00
if (q->properties.is_active) {
dqm->queue_count++;
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
2014-07-17 07:27:00 +09:00
}
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
/*
* Unconditionally increment this counter, regardless of the queue's
* type or whether the queue is active.
*/
dqm->total_queue_count++;
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
dqm_unlock(dqm);
return retval;
out_deallocate_doorbell:
deallocate_doorbell(qpd, q);
out_deallocate_sdma_queue:
if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
dqm_lock(dqm);
deallocate_sdma_queue(dqm, q);
dqm_unlock(dqm);
}
out:
2014-07-17 07:27:00 +09:00
return retval;
}
int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
unsigned int fence_value,
unsigned int timeout_ms)
2014-07-17 07:27:00 +09:00
{
unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies;
2014-07-17 07:27:00 +09:00
while (*fence_addr != fence_value) {
if (time_after(jiffies, end_jiffies)) {
pr_err("qcm fence wait loop timeout expired\n");
/* In HWS case, this is used to halt the driver thread
* in order not to mess up CP states before doing
* scandumps for FW debugging.
*/
while (halt_if_hws_hang)
schedule();
2014-07-17 07:27:00 +09:00
return -ETIME;
}
schedule();
2014-07-17 07:27:00 +09:00
}
return 0;
}
static int unmap_sdma_queues(struct device_queue_manager *dqm)
{
int i, retval = 0;
for (i = 0; i < dqm->dev->device_info->num_sdma_engines +
dqm->dev->device_info->num_xgmi_sdma_engines; i++) {
retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i);
if (retval)
return retval;
}
return retval;
}
/* dqm->lock mutex has to be locked before calling this function */
static int map_queues_cpsch(struct device_queue_manager *dqm)
{
int retval;
if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
return 0;
if (dqm->active_runlist)
return 0;
retval = pm_send_runlist(&dqm->packets, &dqm->queues);
pr_debug("%s sent runlist\n", __func__);
if (retval) {
pr_err("failed to execute runlist\n");
return retval;
}
dqm->active_runlist = true;
return retval;
}
/* dqm->lock mutex has to be locked before calling this function */
static int unmap_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param)
2014-07-17 07:27:00 +09:00
{
int retval = 0;
2014-07-17 07:27:00 +09:00
if (dqm->is_hws_hang)
return -EIO;
if (!dqm->active_runlist)
return retval;
pr_debug("Before destroying queues, sdma queue count is : %u, xgmi sdma queue count is : %u\n",
dqm->sdma_queue_count, dqm->xgmi_sdma_queue_count);
if (dqm->sdma_queue_count > 0 || dqm->xgmi_sdma_queue_count)
unmap_sdma_queues(dqm);
2014-07-17 07:27:00 +09:00
retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE,
filter, filter_param, false, 0);
if (retval)
return retval;
2014-07-17 07:27:00 +09:00
*dqm->fence_addr = KFD_FENCE_INIT;
pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr,
KFD_FENCE_COMPLETED);
/* should be timed out */
retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
queue_preemption_timeout_ms);
if (retval)
return retval;
2014-07-17 07:27:00 +09:00
pm_release_ib(&dqm->packets);
dqm->active_runlist = false;
return retval;
}
/* dqm->lock mutex has to be locked before calling this function */
static int execute_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param)
2014-07-17 07:27:00 +09:00
{
int retval;
if (dqm->is_hws_hang)
return -EIO;
retval = unmap_queues_cpsch(dqm, filter, filter_param);
if (retval) {
pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
dqm->is_hws_hang = true;
schedule_work(&dqm->hw_exception_work);
return retval;
2014-07-17 07:27:00 +09:00
}
return map_queues_cpsch(dqm);
2014-07-17 07:27:00 +09:00
}
static int destroy_queue_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
{
int retval;
struct mqd_manager *mqd_mgr;
2014-07-17 07:27:00 +09:00
retval = 0;
/* remove queue from list to prevent rescheduling after preemption */
dqm_lock(dqm);
if (qpd->is_debug) {
/*
* error, currently we do not allow to destroy a queue
* of a currently debugged process
*/
retval = -EBUSY;
goto failed_try_destroy_debugged_queue;
}
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
2014-07-17 07:27:00 +09:00
deallocate_doorbell(qpd, q);
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
deallocate_sdma_queue(dqm, q);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
dqm->xgmi_sdma_queue_count--;
deallocate_sdma_queue(dqm, q);
}
2014-07-17 07:27:00 +09:00
list_del(&q->list);
qpd->queue_count--;
if (q->properties.is_active) {
dqm->queue_count--;
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
}
2014-07-17 07:27:00 +09:00
drm/amdkfd: Allow user to limit only queues per device This patch replaces the two current amdkfd module parameters with a new one. The current parameters that are being replaced are: - Maximum number of HSA processes - Maximum number of queues per process The new parameter that replaces them is called "Maximum queues per device" This replacement achieves two goals: - Allows the user to have as many HSA processes as it wants (until a maximum of 512 HSA processes in Kaveri). - Removes the limitation the user had on maximum number of queues per HSA process. E.g. the user can now have processes which only have one queue and other processes which have hundreds of queues, while before the user couldn't have more than 128 queues per process (as default). The default value of the new parameter is 4096 (32 * 128, which were the defaults of the old parameters). There is almost no additional GART memory required for the default case. As a reminder, this amount of queues requires a little bit below 4MB of GART memory. v2: In addition, This patch defines a new counter for queues accounting in the DQM structure. This is done because the current counter only counts active queues which allows the user to create more queues than the max_num_of_queues_per_device module parameter allows. However, we need the current counter for the runlist packet build process, so the solution is to have a dedicated counter for this accounting. Signed-off-by: Oded Gabbay <oded.gabbay@amd.com> Reviewed-by: Ben Goz <ben.goz@amd.com>
2015-01-18 20:18:01 +09:00
/*
* Unconditionally decrement this counter, regardless of the queue's
* type
*/
dqm->total_queue_count--;
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
2014-07-17 07:27:00 +09:00
dqm_unlock(dqm);
2014-07-17 07:27:00 +09:00
/* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */
mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
return retval;
2014-07-17 07:27:00 +09:00
failed_try_destroy_debugged_queue:
dqm_unlock(dqm);
2014-07-17 07:27:00 +09:00
return retval;
}
/*
* Low bits must be 0000/FFFF as required by HW, high bits must be 0 to
* stay in user mode.
*/
#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL
/* APE1 limit is inclusive and 64K aligned. */
#define APE1_LIMIT_ALIGNMENT 0xFFFF
static bool set_cache_memory_policy(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
enum cache_policy default_policy,
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
uint64_t alternate_aperture_size)
{
bool retval = true;
if (!dqm->asic_ops.set_cache_memory_policy)
return retval;
2014-07-17 07:27:00 +09:00
dqm_lock(dqm);
2014-07-17 07:27:00 +09:00
if (alternate_aperture_size == 0) {
/* base > limit disables APE1 */
qpd->sh_mem_ape1_base = 1;
qpd->sh_mem_ape1_limit = 0;
} else {
/*
* In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]},
* SH_MEM_APE1_BASE[31:0], 0x0000 }
* APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]},
* SH_MEM_APE1_LIMIT[31:0], 0xFFFF }
* Verify that the base and size parameters can be
* represented in this format and convert them.
* Additionally restrict APE1 to user-mode addresses.
*/
uint64_t base = (uintptr_t)alternate_aperture_base;
uint64_t limit = base + alternate_aperture_size - 1;
if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 ||
(limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) {
retval = false;
2014-07-17 07:27:00 +09:00
goto out;
}
2014-07-17 07:27:00 +09:00
qpd->sh_mem_ape1_base = base >> 16;
qpd->sh_mem_ape1_limit = limit >> 16;
}
retval = dqm->asic_ops.set_cache_memory_policy(
dqm,
qpd,
default_policy,
alternate_policy,
alternate_aperture_base,
alternate_aperture_size);
2014-07-17 07:27:00 +09:00
if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0))
2014-07-17 07:27:00 +09:00
program_sh_mem_settings(dqm, qpd);
pr_debug("sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n",
2014-07-17 07:27:00 +09:00
qpd->sh_mem_config, qpd->sh_mem_ape1_base,
qpd->sh_mem_ape1_limit);
out:
dqm_unlock(dqm);
return retval;
2014-07-17 07:27:00 +09:00
}
static int set_trap_handler(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
uint64_t tba_addr,
uint64_t tma_addr)
{
uint64_t *tma;
if (dqm->dev->cwsr_enabled) {
/* Jump from CWSR trap handler to user trap */
tma = (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
tma[0] = tba_addr;
tma[1] = tma_addr;
} else {
qpd->tba_addr = tba_addr;
qpd->tma_addr = tma_addr;
}
return 0;
}
static int process_termination_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q, *next;
struct device_process_node *cur, *next_dpn;
int retval = 0;
bool found = false;
dqm_lock(dqm);
/* Clear all user mode queues */
list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
int ret;
ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
if (ret)
retval = ret;
}
/* Unregister process */
list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
if (qpd == cur->qpd) {
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
found = true;
break;
}
}
dqm_unlock(dqm);
/* Outside the DQM lock because under the DQM lock we can't do
* reclaim or take other locks that others hold while reclaiming.
*/
if (found)
kfd_dec_compute_active(dqm->dev);
return retval;
}
static int get_wave_state(struct device_queue_manager *dqm,
struct queue *q,
void __user *ctl_stack,
u32 *ctl_stack_used_size,
u32 *save_area_used_size)
{
struct mqd_manager *mqd_mgr;
int r;
dqm_lock(dqm);
if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE ||
q->properties.is_active || !q->device->cwsr_enabled) {
r = -EINVAL;
goto dqm_unlock;
}
mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
if (!mqd_mgr->get_wave_state) {
r = -EINVAL;
goto dqm_unlock;
}
r = mqd_mgr->get_wave_state(mqd_mgr, q->mqd, ctl_stack,
ctl_stack_used_size, save_area_used_size);
dqm_unlock:
dqm_unlock(dqm);
return r;
}
static int process_termination_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
int retval;
struct queue *q;
struct kernel_queue *kq, *kq_next;
struct mqd_manager *mqd_mgr;
struct device_process_node *cur, *next_dpn;
enum kfd_unmap_queues_filter filter =
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
bool found = false;
retval = 0;
dqm_lock(dqm);
/* Clean all kernel queues */
list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
list_del(&kq->list);
dqm->queue_count--;
qpd->is_debug = false;
dqm->total_queue_count--;
filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
}
/* Clear all user mode queues */
list_for_each_entry(q, &qpd->queues_list, list) {
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
deallocate_sdma_queue(dqm, q);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
dqm->xgmi_sdma_queue_count--;
deallocate_sdma_queue(dqm, q);
}
if (q->properties.is_active)
dqm->queue_count--;
dqm->total_queue_count--;
}
/* Unregister process */
list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
if (qpd == cur->qpd) {
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
found = true;
break;
}
}
retval = execute_queues_cpsch(dqm, filter, 0);
if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
qpd->reset_wavefronts = false;
}
/* Lastly, free mqd resources.
* Do free_mqd() after dqm_unlock to avoid circular locking.
*/
while (!list_empty(&qpd->queues_list)) {
q = list_first_entry(&qpd->queues_list, struct queue, list);
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
list_del(&q->list);
qpd->queue_count--;
dqm_unlock(dqm);
mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
dqm_lock(dqm);
}
dqm_unlock(dqm);
/* Outside the DQM lock because under the DQM lock we can't do
* reclaim or take other locks that others hold while reclaiming.
*/
if (found)
kfd_dec_compute_active(dqm->dev);
return retval;
}
static int init_mqd_managers(struct device_queue_manager *dqm)
{
int i, j;
struct mqd_manager *mqd_mgr;
for (i = 0; i < KFD_MQD_TYPE_MAX; i++) {
mqd_mgr = dqm->asic_ops.mqd_manager_init(i, dqm->dev);
if (!mqd_mgr) {
pr_err("mqd manager [%d] initialization failed\n", i);
goto out_free;
}
dqm->mqd_mgrs[i] = mqd_mgr;
}
return 0;
out_free:
for (j = 0; j < i; j++) {
kfree(dqm->mqd_mgrs[j]);
dqm->mqd_mgrs[j] = NULL;
}
return -ENOMEM;
}
/* Allocate one hiq mqd (HWS) and all SDMA mqd in a continuous trunk*/
static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm)
{
int retval;
struct kfd_dev *dev = dqm->dev;
struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd;
uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
(dev->device_info->num_sdma_engines +
dev->device_info->num_xgmi_sdma_engines) *
dev->device_info->num_sdma_queues_per_engine +
dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
retval = amdgpu_amdkfd_alloc_gtt_mem(dev->kgd, size,
&(mem_obj->gtt_mem), &(mem_obj->gpu_addr),
(void *)&(mem_obj->cpu_ptr), true);
return retval;
}
2014-07-17 07:27:00 +09:00
struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
{
struct device_queue_manager *dqm;
pr_debug("Loading device queue manager\n");
dqm = kzalloc(sizeof(*dqm), GFP_KERNEL);
2014-07-17 07:27:00 +09:00
if (!dqm)
return NULL;
switch (dev->device_info->asic_family) {
/* HWS is not available on Hawaii. */
case CHIP_HAWAII:
/* HWS depends on CWSR for timely dequeue. CWSR is not
* available on Tonga.
*
* FIXME: This argument also applies to Kaveri.
*/
case CHIP_TONGA:
dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS;
break;
default:
dqm->sched_policy = sched_policy;
break;
}
2014-07-17 07:27:00 +09:00
dqm->dev = dev;
switch (dqm->sched_policy) {
2014-07-17 07:27:00 +09:00
case KFD_SCHED_POLICY_HWS:
case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION:
/* initialize dqm for cp scheduling */
dqm->ops.create_queue = create_queue_cpsch;
dqm->ops.initialize = initialize_cpsch;
dqm->ops.start = start_cpsch;
dqm->ops.stop = stop_cpsch;
dqm->ops.destroy_queue = destroy_queue_cpsch;
dqm->ops.update_queue = update_queue;
dqm->ops.register_process = register_process;
dqm->ops.unregister_process = unregister_process;
dqm->ops.uninitialize = uninitialize;
dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
dqm->ops.set_trap_handler = set_trap_handler;
dqm->ops.process_termination = process_termination_cpsch;
dqm->ops.evict_process_queues = evict_process_queues_cpsch;
dqm->ops.restore_process_queues = restore_process_queues_cpsch;
dqm->ops.get_wave_state = get_wave_state;
2014-07-17 07:27:00 +09:00
break;
case KFD_SCHED_POLICY_NO_HWS:
/* initialize dqm for no cp scheduling */
dqm->ops.start = start_nocpsch;
dqm->ops.stop = stop_nocpsch;
dqm->ops.create_queue = create_queue_nocpsch;
dqm->ops.destroy_queue = destroy_queue_nocpsch;
dqm->ops.update_queue = update_queue;
dqm->ops.register_process = register_process;
dqm->ops.unregister_process = unregister_process;
dqm->ops.initialize = initialize_nocpsch;
dqm->ops.uninitialize = uninitialize;
dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
dqm->ops.set_trap_handler = set_trap_handler;
dqm->ops.process_termination = process_termination_nocpsch;
dqm->ops.evict_process_queues = evict_process_queues_nocpsch;
dqm->ops.restore_process_queues =
restore_process_queues_nocpsch;
dqm->ops.get_wave_state = get_wave_state;
2014-07-17 07:27:00 +09:00
break;
default:
pr_err("Invalid scheduling policy %d\n", dqm->sched_policy);
goto out_free;
2014-07-17 07:27:00 +09:00
}
switch (dev->device_info->asic_family) {
case CHIP_CARRIZO:
device_queue_manager_init_vi(&dqm->asic_ops);
break;
case CHIP_KAVERI:
device_queue_manager_init_cik(&dqm->asic_ops);
break;
case CHIP_HAWAII:
device_queue_manager_init_cik_hawaii(&dqm->asic_ops);
break;
case CHIP_TONGA:
case CHIP_FIJI:
case CHIP_POLARIS10:
case CHIP_POLARIS11:
case CHIP_POLARIS12:
case CHIP_VEGAM:
device_queue_manager_init_vi_tonga(&dqm->asic_ops);
break;
case CHIP_VEGA10:
case CHIP_VEGA12:
case CHIP_VEGA20:
case CHIP_RAVEN:
case CHIP_ARCTURUS:
device_queue_manager_init_v9(&dqm->asic_ops);
break;
case CHIP_NAVI10:
device_queue_manager_init_v10_navi10(&dqm->asic_ops);
break;
default:
WARN(1, "Unexpected ASIC family %u",
dev->device_info->asic_family);
goto out_free;
}
if (init_mqd_managers(dqm))
goto out_free;
if (allocate_hiq_sdma_mqd(dqm)) {
pr_err("Failed to allocate hiq sdma mqd trunk buffer\n");
goto out_free;
}
if (!dqm->ops.initialize(dqm))
return dqm;
2014-07-17 07:27:00 +09:00
out_free:
kfree(dqm);
return NULL;
2014-07-17 07:27:00 +09:00
}
static void deallocate_hiq_sdma_mqd(struct kfd_dev *dev,
struct kfd_mem_obj *mqd)
{
WARN(!mqd, "No hiq sdma mqd trunk to free");
amdgpu_amdkfd_free_gtt_mem(dev->kgd, mqd->gtt_mem);
}
2014-07-17 07:27:00 +09:00
void device_queue_manager_uninit(struct device_queue_manager *dqm)
{
dqm->ops.uninitialize(dqm);
deallocate_hiq_sdma_mqd(dqm->dev, &dqm->hiq_sdma_mqd);
2014-07-17 07:27:00 +09:00
kfree(dqm);
}
int kfd_process_vm_fault(struct device_queue_manager *dqm,
unsigned int pasid)
{
struct kfd_process_device *pdd;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
int ret = 0;
if (!p)
return -EINVAL;
pdd = kfd_get_process_device_data(dqm->dev, p);
if (pdd)
ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
kfd_unref_process(p);
return ret;
}
static void kfd_process_hw_exception(struct work_struct *work)
{
struct device_queue_manager *dqm = container_of(work,
struct device_queue_manager, hw_exception_work);
amdgpu_amdkfd_gpu_reset(dqm->dev->kgd);
}
#if defined(CONFIG_DEBUG_FS)
static void seq_reg_dump(struct seq_file *m,
uint32_t (*dump)[2], uint32_t n_regs)
{
uint32_t i, count;
for (i = 0, count = 0; i < n_regs; i++) {
if (count == 0 ||
dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) {
seq_printf(m, "%s %08x: %08x",
i ? "\n" : "",
dump[i][0], dump[i][1]);
count = 7;
} else {
seq_printf(m, " %08x", dump[i][1]);
count--;
}
}
seq_puts(m, "\n");
}
int dqm_debugfs_hqds(struct seq_file *m, void *data)
{
struct device_queue_manager *dqm = data;
uint32_t (*dump)[2], n_regs;
int pipe, queue;
int r = 0;
r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
&dump, &n_regs);
if (!r) {
seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n",
KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1,
KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm),
KFD_CIK_HIQ_QUEUE);
seq_reg_dump(m, dump, n_regs);
kfree(dump);
}
for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
int pipe_offset = pipe * get_queues_per_pipe(dqm);
for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) {
if (!test_bit(pipe_offset + queue,
dqm->dev->shared_resources.queue_bitmap))
continue;
r = dqm->dev->kfd2kgd->hqd_dump(
dqm->dev->kgd, pipe, queue, &dump, &n_regs);
if (r)
break;
seq_printf(m, " CP Pipe %d, Queue %d\n",
pipe, queue);
seq_reg_dump(m, dump, n_regs);
kfree(dump);
}
}
for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) {
for (queue = 0;
queue < dqm->dev->device_info->num_sdma_queues_per_engine;
queue++) {
r = dqm->dev->kfd2kgd->hqd_sdma_dump(
dqm->dev->kgd, pipe, queue, &dump, &n_regs);
if (r)
break;
seq_printf(m, " SDMA Engine %d, RLC %d\n",
pipe, queue);
seq_reg_dump(m, dump, n_regs);
kfree(dump);
}
}
return r;
}
int dqm_debugfs_execute_queues(struct device_queue_manager *dqm)
{
int r = 0;
dqm_lock(dqm);
dqm->active_runlist = true;
r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
dqm_unlock(dqm);
return r;
}
#endif