/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_queue.h"

#include "nvk_buffer.h"
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_image.h"
#include "nvk_physical_device.h"

#include "nv_push.h"
#include "nv_push_cl9039.h"
#include "nv_push_cl9097.h"
#include "nv_push_cl90b5.h"
#include "nv_push_cla0c0.h"
#include "cla1c0.h"
#include "nv_push_clc3c0.h"
#include "nv_push_clc397.h"

static VkResult
nvk_queue_push(struct nvk_queue *queue, const struct nv_push *push);

static void
nvk_queue_state_init(struct nvk_queue_state *qs)
{
   memset(qs, 0, sizeof(*qs));
}

static void
nvk_queue_state_finish(struct nvk_device *dev,
                       struct nvk_queue_state *qs)
{
   if (qs->slm.mem)
      nvkmd_mem_unref(qs->slm.mem);
}
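
/* Re-emits queue state that depends on device-level allocations: the image
 * and sampler descriptor pools and the SLM (shader local memory) area.
 * Compares against the values seen on the last submit and only builds and
 * pushes methods when something actually changed.
 */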
static VkResult
nvk_queue_state_update(struct nvk_queue *queue,
                       struct nvk_queue_state *qs)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   const struct nvk_physical_device *pdev = nvk_device_physical(dev);
   struct nvkmd_mem *mem;
   uint32_t alloc_count, bytes_per_warp, bytes_per_tpc;
   bool dirty = false;

   alloc_count = nvk_descriptor_table_alloc_count(&dev->images);
   if (qs->images.alloc_count != alloc_count) {
      qs->images.alloc_count = alloc_count;
      dirty = true;
   }

   alloc_count = nvk_descriptor_table_alloc_count(&dev->samplers);
   if (qs->samplers.alloc_count != alloc_count) {
      qs->samplers.alloc_count = alloc_count;
      dirty = true;
   }

   mem = nvk_slm_area_get_mem_ref(&dev->slm, &bytes_per_warp, &bytes_per_tpc);
   if (qs->slm.mem != mem || qs->slm.bytes_per_warp != bytes_per_warp ||
       qs->slm.bytes_per_tpc != bytes_per_tpc) {
      if (qs->slm.mem)
         nvkmd_mem_unref(qs->slm.mem);
      qs->slm.mem = mem;
      qs->slm.bytes_per_warp = bytes_per_warp;
      qs->slm.bytes_per_tpc = bytes_per_tpc;
      dirty = true;
   } else {
      /* No change; drop the extra reference we just took */
      if (mem)
         nvkmd_mem_unref(mem);
   }

   if (!dirty)
      return VK_SUCCESS;

   uint32_t push_data[64];
   struct nv_push push;
   nv_push_init(&push, push_data, ARRAY_SIZE(push_data),
                nvk_queue_subchannels_from_engines(queue->engines));
   struct nv_push *p = &push;

   if (qs->images.alloc_count > 0) {
      const uint64_t tex_pool_addr =
         nvk_descriptor_table_base_address(&dev->images);

      if (queue->engines & NVKMD_ENGINE_COMPUTE) {
         P_MTHD(p, NVA0C0, SET_TEX_HEADER_POOL_A);
         P_NVA0C0_SET_TEX_HEADER_POOL_A(p, tex_pool_addr >> 32);
         P_NVA0C0_SET_TEX_HEADER_POOL_B(p, tex_pool_addr);
         P_NVA0C0_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
         P_IMMD(p, NVA0C0, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
            .lines = LINES_ALL
         });
      }

      if (queue->engines & NVKMD_ENGINE_3D) {
         P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
         P_NV9097_SET_TEX_HEADER_POOL_A(p, tex_pool_addr >> 32);
         P_NV9097_SET_TEX_HEADER_POOL_B(p, tex_pool_addr);
         P_NV9097_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
         P_IMMD(p, NV9097, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
            .lines = LINES_ALL
         });
      }
   }

   if (qs->samplers.alloc_count > 0) {
      const uint64_t sampler_pool_addr =
         nvk_descriptor_table_base_address(&dev->samplers);

      if (queue->engines & NVKMD_ENGINE_COMPUTE) {
         P_MTHD(p, NVA0C0, SET_TEX_SAMPLER_POOL_A);
         P_NVA0C0_SET_TEX_SAMPLER_POOL_A(p, sampler_pool_addr >> 32);
         P_NVA0C0_SET_TEX_SAMPLER_POOL_B(p, sampler_pool_addr);
         P_NVA0C0_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
         P_IMMD(p, NVA0C0, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
            .lines = LINES_ALL
         });
      }

      if (queue->engines & NVKMD_ENGINE_3D) {
         P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
         P_NV9097_SET_TEX_SAMPLER_POOL_A(p, sampler_pool_addr >> 32);
         P_NV9097_SET_TEX_SAMPLER_POOL_B(p, sampler_pool_addr);
         P_NV9097_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
         P_IMMD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
            .lines = LINES_ALL
         });
      }
   }

   if (qs->slm.mem) {
      const uint64_t slm_addr = qs->slm.mem->va->addr;
      const uint64_t slm_size = qs->slm.mem->size_B;
      const uint64_t slm_per_warp = qs->slm.bytes_per_warp;
      const uint64_t slm_per_tpc = qs->slm.bytes_per_tpc;

      /* The SLM size per TPC must be a multiple of 32 KiB */
      assert(!(slm_per_tpc & 0x7fff));

      if (queue->engines & NVKMD_ENGINE_COMPUTE) {
         P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);

         P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(p, slm_per_tpc >> 32);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(p, slm_per_tpc);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(p, 0xff);

         if (pdev->info.cls_compute < VOLTA_COMPUTE_A) {
            P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
            P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(p, slm_per_tpc >> 32);
            P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(p, slm_per_tpc);
            P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(p, 0xff);
         }
      }

      if (queue->engines & NVKMD_ENGINE_3D) {
         P_MTHD(p, NV9097, SET_SHADER_LOCAL_MEMORY_A);
         P_NV9097_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
         P_NV9097_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
         P_NV9097_SET_SHADER_LOCAL_MEMORY_C(p, slm_size >> 32);
         P_NV9097_SET_SHADER_LOCAL_MEMORY_D(p, slm_size);
         P_NV9097_SET_SHADER_LOCAL_MEMORY_E(p, slm_per_warp);
      }
   }

   return nvk_queue_push(queue, p);
}
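
/* Handles sparse binding submits: waits for the submit's wait semaphores on
 * the bind context, applies all buffer and image (opaque) binds, then
 * signals the submit's signal semaphores.
 */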
static VkResult
nvk_queue_submit_bind(struct nvk_queue *queue,
                      struct vk_queue_submit *submit)
{
   VkResult result;

   result = nvkmd_ctx_wait(queue->bind_ctx, &queue->vk.base,
                           submit->wait_count, submit->waits);
   if (result != VK_SUCCESS)
      return result;

   for (uint32_t i = 0; i < submit->buffer_bind_count; i++) {
      result = nvk_queue_buffer_bind(queue, &submit->buffer_binds[i]);
      if (result != VK_SUCCESS)
         return result;
   }

   for (uint32_t i = 0; i < submit->image_bind_count; i++) {
      result = nvk_queue_image_bind(queue, &submit->image_binds[i]);
      if (result != VK_SUCCESS)
         return result;
   }

   for (uint32_t i = 0; i < submit->image_opaque_bind_count; i++) {
      result = nvk_queue_image_opaque_bind(queue,
                                           &submit->image_opaque_binds[i]);
      if (result != VK_SUCCESS)
         return result;
   }

   result = nvkmd_ctx_signal(queue->bind_ctx, &queue->vk.base,
                             submit->signal_count, submit->signals);
   if (result != VK_SUCCESS)
      return result;

   return VK_SUCCESS;
}
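
/* Handles command-buffer submits on the exec context.  Before executing any
 * pushbufs, this flushes pending queue state and the device upload queue and
 * makes the exec context wait for the upload time point, so uploads land
 * before the commands that depend on them.
 */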
static VkResult
nvk_queue_submit_exec(struct nvk_queue *queue,
                      struct vk_queue_submit *submit)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   VkResult result;

   if (submit->command_buffer_count > 0) {
      result = nvk_queue_state_update(queue, &queue->state);
      if (result != VK_SUCCESS)
         return result;

      uint64_t upload_time_point;
      result = nvk_upload_queue_flush(dev, &dev->upload, &upload_time_point);
      if (result != VK_SUCCESS)
         return result;

      if (upload_time_point > 0) {
         struct vk_sync_wait wait = {
            .sync = dev->upload.stream.sync,
            .stage_mask = ~0,
            .wait_value = upload_time_point,
         };
         result = nvkmd_ctx_wait(queue->exec_ctx, &queue->vk.base, 1, &wait);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   result = nvkmd_ctx_wait(queue->exec_ctx, &queue->vk.base,
                           submit->wait_count, submit->waits);
   if (result != VK_SUCCESS)
      return result;

   for (unsigned i = 0; i < submit->command_buffer_count; i++) {
      struct nvk_cmd_buffer *cmd =
         container_of(submit->command_buffers[i], struct nvk_cmd_buffer, vk);

      const uint32_t max_execs =
         util_dynarray_num_elements(&cmd->pushes, struct nvk_cmd_push);
      STACK_ARRAY(struct nvkmd_ctx_exec, execs, max_execs);
      uint32_t exec_count = 0;

      util_dynarray_foreach(&cmd->pushes, struct nvk_cmd_push, push) {
         if (push->range == 0)
            continue;

         execs[exec_count++] = (struct nvkmd_ctx_exec) {
            .addr = push->addr,
            .size_B = push->range,
            .incomplete = push->incomplete,
            .no_prefetch = push->no_prefetch,
         };
      }

      result = nvkmd_ctx_exec(queue->exec_ctx, &queue->vk.base,
                              exec_count, execs);
      STACK_ARRAY_FINISH(execs);
      if (result != VK_SUCCESS)
         return result;
   }

   return nvkmd_ctx_signal(queue->exec_ctx, &queue->vk.base,
                           submit->signal_count, submit->signals);
}
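
/* vk_queue::driver_submit hook: routes sparse binding work to the bind
 * context and everything else to the exec context, marking the queue lost
 * on failure.
 */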
static VkResult
nvk_queue_submit(struct vk_queue *vk_queue,
                 struct vk_queue_submit *submit)
{
   struct nvk_queue *queue = container_of(vk_queue, struct nvk_queue, vk);
   VkResult result;

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   if (submit->buffer_bind_count > 0 ||
       submit->image_bind_count > 0 ||
       submit->image_opaque_bind_count > 0) {
      /* Sparse binds are never mixed with command buffers in one submit */
      assert(submit->command_buffer_count == 0);
      result = nvk_queue_submit_bind(queue, submit);
      if (result != VK_SUCCESS)
         return vk_queue_set_lost(&queue->vk, "Bind operation failed");
   } else {
      result = nvk_queue_submit_exec(queue, submit);
      if (result != VK_SUCCESS)
         return vk_queue_set_lost(&queue->vk, "Submit failed");
   }

   return VK_SUCCESS;
}
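
/* Pushes a small, driver-internal push buffer to the queue's exec context
 * through the queue's memory stream.  No-op when the push is empty.
 */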
static VkResult
nvk_queue_push(struct nvk_queue *queue, const struct nv_push *push)
{
   struct nvk_device *dev = nvk_queue_device(queue);

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   if (nv_push_dw_count(push) == 0)
      return VK_SUCCESS;

   return nvk_mem_stream_push(dev, &queue->push_stream, queue->exec_ctx,
                              push->start, nv_push_dw_count(push), NULL);
}
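
/* Emits the one-time context setup push: binds the M2MF object on Fermi-era
 * classes, then emits the initial 3D and compute state for whichever engines
 * this queue exposes.
 */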
static VkResult
nvk_queue_init_context_state(struct nvk_queue *queue)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   const struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   uint32_t push_data[4096];
   struct nv_push push;
   nv_push_init(&push, push_data, ARRAY_SIZE(push_data),
                nvk_queue_subchannels_from_engines(queue->engines));
   struct nv_push *p = &push;

   /* M2MF state */
   if (pdev->info.cls_m2mf <= FERMI_MEMORY_TO_MEMORY_FORMAT_A) {
      /* We absolutely do not support Fermi but, if somebody wants to toy
       * around with it, this is a must.
       */
      P_MTHD(p, NV9039, SET_OBJECT);
      P_NV9039_SET_OBJECT(p, {
         .class_id = pdev->info.cls_m2mf,
         .engine_id = 0,
      });
   }

   if (queue->engines & NVKMD_ENGINE_3D) {
      result = nvk_push_draw_state_init(queue, p);
      if (result != VK_SUCCESS)
         return result;
   }

   if (queue->engines & NVKMD_ENGINE_COMPUTE) {
      result = nvk_push_dispatch_state_init(queue, p);
      if (result != VK_SUCCESS)
         return result;
   }

   return nvk_queue_push(queue, &push);
}

static VkQueueGlobalPriority
get_queue_global_priority(const VkDeviceQueueCreateInfo *pCreateInfo)
{
   const VkDeviceQueueGlobalPriorityCreateInfo *priority_info =
      vk_find_struct_const(pCreateInfo->pNext,
                           DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO);
   if (priority_info == NULL)
      return VK_QUEUE_GLOBAL_PRIORITY_MEDIUM;

   return priority_info->globalPriority;
}
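
/* Creates a queue: for engine-backed queues, sets up the exec context and a
 * 4 KiB draw_cb0 buffer that is zero-filled through the upload queue; for
 * sparse binding queues, a bind context; then the push stream and the
 * initial context state.
 */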
VkResult
nvk_queue_create(struct nvk_device *dev,
                 const VkDeviceQueueCreateInfo *pCreateInfo,
                 uint32_t index_in_family)
{
   const struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   assert(pCreateInfo->queueFamilyIndex < pdev->queue_family_count);
   const struct nvk_queue_family *queue_family =
      &pdev->queue_families[pCreateInfo->queueFamilyIndex];

   const VkQueueGlobalPriority global_priority =
      get_queue_global_priority(pCreateInfo);

   /* From the Vulkan 1.3.295 spec:
    *
    *    "If the globalPriorityQuery feature is enabled and the requested
    *    global priority is not reported via
    *    VkQueueFamilyGlobalPriorityPropertiesKHR, the driver implementation
    *    must fail the queue creation. In this scenario,
    *    VK_ERROR_INITIALIZATION_FAILED is returned."
    *
    * We only report MEDIUM, so reject anything else.
    */
   if (dev->vk.enabled_features.globalPriorityQuery &&
       global_priority != VK_QUEUE_GLOBAL_PRIORITY_MEDIUM)
      return VK_ERROR_INITIALIZATION_FAILED;

   if (global_priority > VK_QUEUE_GLOBAL_PRIORITY_MEDIUM)
      return VK_ERROR_NOT_PERMITTED;

   struct nvk_queue *queue = vk_zalloc(&dev->vk.alloc, sizeof(struct nvk_queue),
                                       8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!queue)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   result = vk_queue_init(&queue->vk, &dev->vk, pCreateInfo, index_in_family);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   nvk_queue_state_init(&queue->state);

   queue->engines =
      nvk_queue_engines_from_queue_flags(queue_family->queue_flags);

   if (queue->engines) {
      result = nvkmd_dev_create_ctx(dev->nvkmd, &dev->vk.base,
                                    queue->engines, &queue->exec_ctx);
      if (result != VK_SUCCESS)
         goto fail_init;

      result = nvkmd_dev_alloc_mem(dev->nvkmd, &dev->vk.base,
                                   4096, 0, NVKMD_MEM_LOCAL,
                                   &queue->draw_cb0);
      if (result != VK_SUCCESS)
         goto fail_exec_ctx;

      result = nvk_upload_queue_fill(dev, &dev->upload,
                                     queue->draw_cb0->va->addr, 0,
                                     queue->draw_cb0->size_B);
      if (result != VK_SUCCESS)
         goto fail_draw_cb0;
   }

   if (queue_family->queue_flags & VK_QUEUE_SPARSE_BINDING_BIT) {
      result = nvkmd_dev_create_ctx(dev->nvkmd, &dev->vk.base,
                                    NVKMD_ENGINE_BIND, &queue->bind_ctx);
      if (result != VK_SUCCESS)
         goto fail_draw_cb0;
   }

   result = nvk_mem_stream_init(dev, &queue->push_stream);
   if (result != VK_SUCCESS)
      goto fail_bind_ctx;

   result = nvk_queue_init_context_state(queue);
   if (result != VK_SUCCESS)
      goto fail_push_stream;

   queue->vk.driver_submit = nvk_queue_submit;

   return VK_SUCCESS;

fail_push_stream:
   nvk_mem_stream_sync(dev, &queue->push_stream, queue->exec_ctx);
   nvk_mem_stream_finish(dev, &queue->push_stream);
fail_bind_ctx:
   if (queue->bind_ctx != NULL)
      nvkmd_ctx_destroy(queue->bind_ctx);
fail_draw_cb0:
   if (queue->draw_cb0 != NULL)
      nvkmd_mem_unref(queue->draw_cb0);
fail_exec_ctx:
   if (queue->exec_ctx != NULL)
      nvkmd_ctx_destroy(queue->exec_ctx);
fail_init:
   nvk_queue_state_finish(dev, &queue->state);
   vk_queue_finish(&queue->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, queue);
   return result;
}
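
/* Tears the queue down, syncing the push stream and the device upload queue
 * before releasing the memory they may still reference.
 */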
void
nvk_queue_destroy(struct nvk_device *dev, struct nvk_queue *queue)
{
   nvk_mem_stream_sync(dev, &queue->push_stream, queue->exec_ctx);
   nvk_mem_stream_finish(dev, &queue->push_stream);

   if (queue->draw_cb0 != NULL) {
      /* Wait for any upload queue work that may still touch draw_cb0
       * (e.g. the zero-fill at creation) before dropping our reference.
       */
      nvk_upload_queue_sync(dev, &dev->upload);
      nvkmd_mem_unref(queue->draw_cb0);
   }

   nvk_queue_state_finish(dev, &queue->state);

   if (queue->bind_ctx != NULL)
      nvkmd_ctx_destroy(queue->bind_ctx);

   if (queue->exec_ctx != NULL)
      nvkmd_ctx_destroy(queue->exec_ctx);

   vk_queue_finish(&queue->vk);

   vk_free(&dev->vk.alloc, queue);
}