| /* |
| * Copyright 2024 Valve Corporation |
| * Copyright 2024 Alyssa Rosenzweig |
| * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. |
| * SPDX-License-Identifier: MIT |
| */ |
| #include "hk_cmd_buffer.h" |
| |
| #include "agx_abi.h" |
| #include "agx_bo.h" |
| #include "agx_device.h" |
| #include "agx_linker.h" |
| #include "agx_tilebuffer.h" |
| #include "agx_usc.h" |
| #include "hk_buffer.h" |
| #include "hk_cmd_pool.h" |
| #include "hk_descriptor_set.h" |
| #include "hk_descriptor_set_layout.h" |
| #include "hk_device.h" |
| #include "hk_device_memory.h" |
| #include "hk_entrypoints.h" |
| #include "hk_image.h" |
| #include "hk_image_view.h" |
| #include "hk_physical_device.h" |
| #include "hk_shader.h" |
| |
| #include "libagx_dgc.h" |
| #include "pool.h" |
| #include "shader_enums.h" |
| #include "vk_pipeline_layout.h" |
| #include "vk_synchronization.h" |
| |
| #include "util/list.h" |
| #include "util/macros.h" |
| #include "util/u_dynarray.h" |
| #include "vulkan/vulkan_core.h" |
| |
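| /* Free the per-set push descriptor storage that hk_cmd_push_descriptors() |
| * allocates lazily. |
| */ |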
| static void |
| hk_descriptor_state_fini(struct hk_cmd_buffer *cmd, |
| struct hk_descriptor_state *desc) |
| { |
| struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); |
| |
| for (unsigned i = 0; i < HK_MAX_SETS; i++) { |
| vk_free(&pool->vk.alloc, desc->push[i]); |
| desc->push[i] = NULL; |
| } |
| } |
| |
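| /* Release everything that does not survive a reset: push descriptor |
| * storage, upload BOs handed back to the pool, control streams, and any |
| * large BOs owned directly by the command buffer. |
| */ |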
| static void |
| hk_free_resettable_cmd_buffer(struct hk_cmd_buffer *cmd) |
| { |
| struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); |
| struct hk_device *dev = hk_cmd_pool_device(pool); |
| |
| hk_descriptor_state_fini(cmd, &cmd->state.gfx.descriptors); |
| hk_descriptor_state_fini(cmd, &cmd->state.cs.descriptors); |
| |
| hk_cmd_pool_free_bo_list(pool, &cmd->uploader.main.bos); |
| hk_cmd_pool_free_usc_bo_list(pool, &cmd->uploader.usc.bos); |
| |
| list_for_each_entry_safe(struct hk_cs, it, &cmd->control_streams, node) { |
| list_del(&it->node); |
| hk_cs_destroy(it); |
| } |
| |
| util_dynarray_foreach(&cmd->large_bos, struct agx_bo *, bo) { |
| agx_bo_unreference(&dev->dev, *bo); |
| } |
| |
| util_dynarray_clear(&cmd->large_bos); |
| } |
| |
| static void |
| hk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer) |
| { |
| struct hk_cmd_buffer *cmd = |
| container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk); |
| struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); |
| |
| util_dynarray_fini(&cmd->large_bos); |
| hk_free_resettable_cmd_buffer(cmd); |
| vk_command_buffer_finish(&cmd->vk); |
| vk_free(&pool->vk.alloc, cmd); |
| } |
| |
| static VkResult |
| hk_create_cmd_buffer(struct vk_command_pool *vk_pool, |
| VkCommandBufferLevel level, |
| struct vk_command_buffer **cmd_buffer_out) |
| { |
| struct hk_cmd_pool *pool = container_of(vk_pool, struct hk_cmd_pool, vk); |
| struct hk_device *dev = hk_cmd_pool_device(pool); |
| struct hk_cmd_buffer *cmd; |
| VkResult result; |
| |
| cmd = vk_zalloc(&pool->vk.alloc, sizeof(*cmd), 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (cmd == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| result = |
| vk_command_buffer_init(&pool->vk, &cmd->vk, &hk_cmd_buffer_ops, level); |
| if (result != VK_SUCCESS) { |
| vk_free(&pool->vk.alloc, cmd); |
| return result; |
| } |
| |
| util_dynarray_init(&cmd->large_bos, NULL); |
| |
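| /* The common dynamic graphics state tracking expects the driver to provide |
| * backing storage for vertex input and sample location state. |
| */ |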
| cmd->vk.dynamic_graphics_state.vi = &cmd->state.gfx._dynamic_vi; |
| cmd->vk.dynamic_graphics_state.ms.sample_locations = |
| &cmd->state.gfx._dynamic_sl; |
| |
| list_inithead(&cmd->uploader.main.bos); |
| list_inithead(&cmd->uploader.usc.bos); |
| list_inithead(&cmd->control_streams); |
| |
| *cmd_buffer_out = &cmd->vk; |
| |
| return VK_SUCCESS; |
| } |
| |
| static void |
| hk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, |
| UNUSED VkCommandBufferResetFlags flags) |
| { |
| struct hk_cmd_buffer *cmd = |
| container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk); |
| |
| vk_command_buffer_reset(&cmd->vk); |
| hk_free_resettable_cmd_buffer(cmd); |
| |
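| /* The uploaders' BOs were handed back to the pool above, so restart the |
| * bump allocators from scratch. |
| */ |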
| cmd->uploader.main.map = NULL; |
| cmd->uploader.main.base = 0; |
| cmd->uploader.main.offset = 0; |
| cmd->uploader.usc.map = NULL; |
| cmd->uploader.usc.base = 0; |
| cmd->uploader.usc.offset = 0; |
| |
| cmd->current_cs.gfx = NULL; |
| cmd->current_cs.cs = NULL; |
| cmd->current_cs.post_gfx = NULL; |
| cmd->current_cs.pre_gfx = NULL; |
| |
| /* TODO: clear pool! */ |
| |
| memset(&cmd->state, 0, sizeof(cmd->state)); |
| } |
| |
| const struct vk_command_buffer_ops hk_cmd_buffer_ops = { |
| .create = hk_create_cmd_buffer, |
| .reset = hk_reset_cmd_buffer, |
| .destroy = hk_destroy_cmd_buffer, |
| }; |
| |
| static VkResult |
| hk_cmd_buffer_alloc_bo(struct hk_cmd_buffer *cmd, bool usc, |
| struct hk_cmd_bo **bo_out) |
| { |
| VkResult result = hk_cmd_pool_alloc_bo(hk_cmd_buffer_pool(cmd), usc, bo_out); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| if (usc) |
| list_addtail(&(*bo_out)->link, &cmd->uploader.usc.bos); |
| else |
| list_addtail(&(*bo_out)->link, &cmd->uploader.main.bos); |
| |
| return VK_SUCCESS; |
| } |
| |
| struct agx_ptr |
| hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size, |
| uint32_t alignment, bool usc) |
| { |
| struct hk_device *dev = hk_cmd_buffer_device(cmd); |
| struct hk_uploader *uploader = |
| usc ? &cmd->uploader.usc : &cmd->uploader.main; |
| |
| /* Handle large allocations specially: they get a dedicated BO owned by the |
| * command buffer, e.g. for statically allocated vertex output buffers with |
| * geometry shaders. |
| */ |
| if (size > HK_CMD_BO_SIZE) { |
| uint32_t flags = usc ? AGX_BO_LOW_VA : 0; |
| struct agx_bo *bo = |
| agx_bo_create(&dev->dev, size, flags, 0, "Large pool allocation"); |
| |
| util_dynarray_append(&cmd->large_bos, struct agx_bo *, bo); |
| return (struct agx_ptr){ |
| .gpu = bo->va->addr, |
| .cpu = agx_bo_map(bo), |
| }; |
| } |
| |
| assert(size <= HK_CMD_BO_SIZE); |
| assert(alignment > 0); |
| |
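| /* Otherwise, bump-allocate from the current upload BO if the aligned |
| * allocation still fits. |
| */ |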
| uint32_t offset = align(uploader->offset, alignment); |
| |
| assert(offset <= HK_CMD_BO_SIZE); |
| if (uploader->map != NULL && size <= HK_CMD_BO_SIZE - offset) { |
| uploader->offset = offset + size; |
| |
| return (struct agx_ptr){ |
| .gpu = uploader->base + offset, |
| .cpu = uploader->map + offset, |
| }; |
| } |
| |
| struct hk_cmd_bo *bo; |
| VkResult result = hk_cmd_buffer_alloc_bo(cmd, usc, &bo); |
| if (unlikely(result != VK_SUCCESS)) { |
| vk_command_buffer_set_error(&cmd->vk, result); |
| return (struct agx_ptr){0}; |
| } |
| |
| /* Pick whichever of the current upload BO and the new BO will have more |
| * room left to be the BO for the next upload. If our upload size is |
| * bigger than the old offset, we're better off burning the whole new |
| * upload BO on this one allocation and continuing on the current upload |
| * BO. |
| */ |
| if (uploader->map == NULL || size < uploader->offset) { |
| uploader->map = agx_bo_map(bo->bo); |
| uploader->base = bo->bo->va->addr; |
| uploader->offset = size; |
| } |
| |
| return (struct agx_ptr){ |
| .gpu = bo->bo->va->addr, |
| .cpu = bo->map, |
| }; |
| } |
| |
| uint64_t |
| hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, uint32_t size, |
| uint32_t alignment) |
| { |
| struct agx_ptr T = hk_pool_alloc(cmd, size, alignment); |
| if (unlikely(T.cpu == NULL)) |
| return 0; |
| |
| memcpy(T.cpu, data, size); |
| return T.gpu; |
| } |
| |
| VKAPI_ATTR VkResult VKAPI_CALL |
| hk_BeginCommandBuffer(VkCommandBuffer commandBuffer, |
| const VkCommandBufferBeginInfo *pBeginInfo) |
| { |
| VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); |
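| /* Beginning a command buffer implicitly resets it, so clear out any state |
| * left over from a previous recording. |
| */ |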
| hk_reset_cmd_buffer(&cmd->vk, 0); |
| |
| perf_debug(cmd, "Begin command buffer"); |
| hk_cmd_buffer_begin_compute(cmd, pBeginInfo); |
| hk_cmd_buffer_begin_graphics(cmd, pBeginInfo); |
| |
| return VK_SUCCESS; |
| } |
| |
| /* |
| * Merge adjacent compute control streams. Except for reading timestamps, there |
| * is no reason to submit two CDM streams back-to-back in the same command |
| * buffer. However, it is challenging to avoid constructing such sequences due |
| * to the gymnastics required to reorder compute around graphics. Merging at |
| * EndCommandBuffer is cheap O(# of control streams) and lets us get away with |
| * the sloppiness. |
| */ |
| static void |
| merge_control_streams(struct hk_cmd_buffer *cmd) |
| { |
| struct hk_cs *last = NULL; |
| |
| list_for_each_entry_safe(struct hk_cs, cs, &cmd->control_streams, node) { |
| if (cs->type == HK_CS_CDM && last && last->type == HK_CS_CDM && |
| !last->timestamp.end.handle) { |
| |
| hk_cs_merge_cdm(last, cs); |
| list_del(&cs->node); |
| hk_cs_destroy(cs); |
| } else { |
| last = cs; |
| } |
| } |
| } |
| |
| VKAPI_ATTR VkResult VKAPI_CALL |
| hk_EndCommandBuffer(VkCommandBuffer commandBuffer) |
| { |
| VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); |
| |
| assert(cmd->current_cs.gfx == NULL && cmd->current_cs.pre_gfx == NULL && |
| "must end rendering before ending the command buffer"); |
| |
| perf_debug(cmd, "End command buffer"); |
| hk_cmd_buffer_end_compute(cmd); |
| hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx); |
| |
| struct hk_device *dev = hk_cmd_buffer_device(cmd); |
| if (likely(!(dev->dev.debug & AGX_DBG_NOMERGE))) { |
| merge_control_streams(cmd); |
| } |
| |
| /* We cannot terminate CDM control streams until after merging, since merging |
| * needs to append stream links late. Now that we've merged, insert all the |
| * missing stream terminates. |
| */ |
| list_for_each_entry(struct hk_cs, cs, &cmd->control_streams, node) { |
| if (cs->type == HK_CS_CDM) { |
| cs->current = agx_cdm_terminate(cs->current); |
| } |
| } |
| |
| return vk_command_buffer_get_record_result(&cmd->vk); |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| hk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, |
| const VkDependencyInfo *pDependencyInfo) |
| { |
| VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); |
| struct hk_device *dev = hk_cmd_buffer_device(cmd); |
| |
| if (HK_PERF(dev, NOBARRIER)) |
| return; |
| |
| perf_debug(cmd, "Pipeline barrier"); |
| |
| /* The big hammer. We end both compute and graphics batches. Ending compute |
| * here is necessary to properly handle graphics->compute dependencies. |
| * |
| * XXX: perf. */ |
| hk_cmd_buffer_end_compute(cmd); |
| hk_cmd_buffer_end_graphics(cmd); |
| } |
| |
| void |
| hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count, |
| const gl_shader_stage *stages, |
| struct vk_shader **const shaders) |
| { |
| struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk); |
| |
| for (uint32_t i = 0; i < stage_count; i++) { |
| struct hk_api_shader *shader = |
| container_of(shaders[i], struct hk_api_shader, vk); |
| |
| if (stages[i] == MESA_SHADER_COMPUTE || stages[i] == MESA_SHADER_KERNEL) |
| hk_cmd_bind_compute_shader(cmd, shader); |
| else |
| hk_cmd_bind_graphics_shader(cmd, stages[i], shader); |
| } |
| } |
| |
| static void |
| hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd, |
| struct hk_descriptor_state *desc, |
| const VkBindDescriptorSetsInfoKHR *info) |
| { |
| VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout); |
| |
| * From the Vulkan 1.3.275 spec: |
| * |
| * "When binding a descriptor set (see Descriptor Set Binding) to |
| * set number N... |
| * |
| * If, additionally, the previously bound descriptor set for set |
| * N was bound using a pipeline layout not compatible for set N, |
| * then all bindings in sets numbered greater than N are |
| * disturbed." |
| * |
| * This means that, if some earlier set gets bound in such a way that |
| * it changes set_dynamic_buffer_start[s], this binding is implicitly |
| * invalidated. Therefore, we can always look at the current value |
| * of set_dynamic_buffer_start[s] as the base of our dynamic buffer |
| * range and it's only our responsibility to adjust all |
| * set_dynamic_buffer_start[p] for p > s as needed. |
| */ |
| uint8_t dyn_buffer_start = |
| desc->root.set_dynamic_buffer_start[info->firstSet]; |
| |
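| /* next_dyn_offset indexes pDynamicOffsets across all of the sets being |
| * bound, while dyn_buffer_start tracks the absolute slot in the root |
| * table's dynamic buffer array. |
| */ |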
| uint32_t next_dyn_offset = 0; |
| for (uint32_t i = 0; i < info->descriptorSetCount; ++i) { |
| unsigned s = i + info->firstSet; |
| VK_FROM_HANDLE(hk_descriptor_set, set, info->pDescriptorSets[i]); |
| |
| if (desc->sets[s] != set) { |
| if (set != NULL) { |
| desc->root.sets[s] = hk_descriptor_set_addr(set); |
| desc->set_sizes[s] = set->size; |
| } else { |
| desc->root.sets[s] = 0; |
| desc->set_sizes[s] = 0; |
| } |
| desc->sets[s] = set; |
| desc->sets_dirty |= BITFIELD_BIT(s); |
| |
| /* Binding descriptors invalidates push descriptors */ |
| desc->push_dirty &= ~BITFIELD_BIT(s); |
| } |
| |
| desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start; |
| |
| if (pipeline_layout->set_layouts[s] != NULL) { |
| const struct hk_descriptor_set_layout *set_layout = |
| vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[s]); |
| |
| if (set != NULL && set_layout->dynamic_buffer_count > 0) { |
| for (uint32_t j = 0; j < set_layout->dynamic_buffer_count; j++) { |
| struct hk_buffer_address addr = set->dynamic_buffers[j]; |
| addr.base_addr += info->pDynamicOffsets[next_dyn_offset + j]; |
| desc->root.dynamic_buffers[dyn_buffer_start + j] = addr; |
| } |
| next_dyn_offset += set->layout->dynamic_buffer_count; |
| } |
| |
| dyn_buffer_start += set_layout->dynamic_buffer_count; |
| } else { |
| assert(set == NULL); |
| } |
| } |
| assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS); |
| assert(next_dyn_offset <= info->dynamicOffsetCount); |
| |
| for (uint32_t s = info->firstSet + info->descriptorSetCount; s < HK_MAX_SETS; |
| s++) |
| desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start; |
| |
| desc->root_dirty = true; |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| hk_CmdBindDescriptorSets2KHR( |
| VkCommandBuffer commandBuffer, |
| const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo) |
| { |
| VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); |
| |
| if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { |
| hk_bind_descriptor_sets(cmd, &cmd->state.gfx.descriptors, |
| pBindDescriptorSetsInfo); |
| } |
| |
| if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { |
| hk_bind_descriptor_sets(cmd, &cmd->state.cs.descriptors, |
| pBindDescriptorSetsInfo); |
| } |
| } |
| |
| static void |
| hk_push_constants(UNUSED struct hk_cmd_buffer *cmd, |
| struct hk_descriptor_state *desc, |
| const VkPushConstantsInfoKHR *info) |
| { |
| memcpy(desc->root.push + info->offset, info->pValues, info->size); |
| desc->root_dirty = true; |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| hk_CmdPushConstants2KHR(VkCommandBuffer commandBuffer, |
| const VkPushConstantsInfoKHR *pPushConstantsInfo) |
| { |
| VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); |
| |
| if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) |
| hk_push_constants(cmd, &cmd->state.gfx.descriptors, pPushConstantsInfo); |
| |
| if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) |
| hk_push_constants(cmd, &cmd->state.cs.descriptors, pPushConstantsInfo); |
| } |
| |
| static struct hk_push_descriptor_set * |
| hk_cmd_push_descriptors(struct hk_cmd_buffer *cmd, |
| struct hk_descriptor_state *desc, uint32_t set) |
| { |
| assert(set < HK_MAX_SETS); |
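| /* Allocate host storage for the push set lazily; it is freed by |
| * hk_descriptor_state_fini when the command buffer is reset or destroyed. |
| */ |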
| if (unlikely(desc->push[set] == NULL)) { |
| desc->push[set] = |
| vk_zalloc(&cmd->vk.pool->alloc, sizeof(*desc->push[set]), 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (unlikely(desc->push[set] == NULL)) { |
| vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY); |
| return NULL; |
| } |
| } |
| |
| /* Pushing descriptors replaces whatever sets are bound */ |
| desc->sets[set] = NULL; |
| desc->push_dirty |= BITFIELD_BIT(set); |
| |
| return desc->push[set]; |
| } |
| |
| static void |
| hk_push_descriptor_set(struct hk_cmd_buffer *cmd, |
| struct hk_descriptor_state *desc, |
| const VkPushDescriptorSetInfoKHR *info) |
| { |
| VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout); |
| |
| struct hk_push_descriptor_set *push_set = |
| hk_cmd_push_descriptors(cmd, desc, info->set); |
| if (unlikely(push_set == NULL)) |
| return; |
| |
| struct hk_descriptor_set_layout *set_layout = |
| vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[info->set]); |
| |
| hk_push_descriptor_set_update(push_set, set_layout, |
| info->descriptorWriteCount, |
| info->pDescriptorWrites); |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| hk_CmdPushDescriptorSet2KHR( |
| VkCommandBuffer commandBuffer, |
| const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo) |
| { |
| VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); |
| |
| if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { |
| hk_push_descriptor_set(cmd, &cmd->state.gfx.descriptors, |
| pPushDescriptorSetInfo); |
| } |
| |
| if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { |
| hk_push_descriptor_set(cmd, &cmd->state.cs.descriptors, |
| pPushDescriptorSetInfo); |
| } |
| } |
| |
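| /* Upload any dirty push descriptor sets to GPU memory and point the root |
| * table at the uploaded copies. |
| */ |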
| void |
| hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd, |
| struct hk_descriptor_state *desc) |
| { |
| u_foreach_bit(set_idx, desc->push_dirty) { |
| struct hk_push_descriptor_set *push_set = desc->push[set_idx]; |
| uint64_t push_set_addr = hk_pool_upload( |
| cmd, push_set->data, sizeof(push_set->data), HK_MIN_UBO_ALIGNMENT); |
| |
| desc->root.sets[set_idx] = push_set_addr; |
| desc->set_sizes[set_idx] = sizeof(push_set->data); |
| } |
| |
| desc->root_dirty = true; |
| desc->push_dirty = 0; |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| hk_CmdPushDescriptorSetWithTemplate2KHR( |
| VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR |
| *pPushDescriptorSetWithTemplateInfo) |
| { |
| VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); |
| VK_FROM_HANDLE(vk_descriptor_update_template, template, |
| pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate); |
| VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, |
| pPushDescriptorSetWithTemplateInfo->layout); |
| |
| struct hk_descriptor_state *desc = |
| hk_get_descriptors_state(cmd, template->bind_point); |
| struct hk_push_descriptor_set *push_set = hk_cmd_push_descriptors( |
| cmd, desc, pPushDescriptorSetWithTemplateInfo->set); |
| if (unlikely(push_set == NULL)) |
| return; |
| |
| struct hk_descriptor_set_layout *set_layout = vk_to_hk_descriptor_set_layout( |
| pipeline_layout->set_layouts[pPushDescriptorSetWithTemplateInfo->set]); |
| |
| hk_push_descriptor_set_update_template( |
| push_set, set_layout, template, |
| pPushDescriptorSetWithTemplateInfo->pData); |
| } |
| |
| uint64_t |
| hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd, |
| VkPipelineBindPoint bind_point) |
| { |
| struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); |
| struct hk_root_descriptor_table *root = &desc->root; |
| |
| struct agx_ptr root_ptr = hk_pool_alloc(cmd, sizeof(*root), 8); |
| if (!root_ptr.gpu) |
| return 0; |
| |
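| /* The root table stores its own GPU address at offset 0 (see the |
| * "self-reflective" static_assert in hk_upload_usc_words), so a shader can |
| * re-derive the table's address from its contents. |
| */ |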
| root->root_desc_addr = root_ptr.gpu; |
| |
| memcpy(root_ptr.cpu, root, sizeof(*root)); |
| return root_ptr.gpu; |
| } |
| |
| void |
| hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b, |
| struct hk_cmd_buffer *cmd) |
| { |
| struct hk_rendering_state *render = &cmd->state.gfx.render; |
| |
| /* Upload texture/PBE descriptors for each render target so we can clear |
| * spilled render targets. |
| */ |
| struct agx_ptr descs = |
| hk_pool_alloc(cmd, AGX_TEXTURE_LENGTH * 2 * render->color_att_count, 64); |
| struct agx_texture_packed *desc = descs.cpu; |
| if (!desc) |
| return; |
| |
| for (unsigned i = 0; i < render->color_att_count; ++i) { |
| struct hk_image_view *iview = render->color_att[i].iview; |
| if (!iview) { |
| /* XXX: probably should emit a null descriptor here...? */ |
| continue; |
| } |
| |
| memcpy(&desc[(i * 2) + 0], &iview->planes[0].emrt_texture, sizeof(*desc)); |
| memcpy(&desc[(i * 2) + 1], &iview->planes[0].emrt_pbe, sizeof(*desc)); |
| } |
| |
| desc = descs.cpu; |
| |
| /* Bind the base as u0_u1 for bindless access */ |
| agx_usc_uniform(b, 0, 4, hk_pool_upload(cmd, &descs.gpu, 8, 8)); |
| } |
| |
| void |
| hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, |
| struct hk_shader *s) |
| { |
| struct hk_device *dev = hk_cmd_buffer_device(cmd); |
| uint32_t max_scratch_size = |
| MAX2(s->b.info.scratch_size, s->b.info.preamble_scratch_size); |
| |
| if (max_scratch_size == 0) |
| return; |
| |
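| /* The preamble requirement is tracked as a flag (0 or 1) rather than a |
| * byte count; the scratch itself is sized via hk_device_alloc_scratch |
| * below. |
| */ |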
| unsigned preamble_size = (s->b.info.preamble_scratch_size > 0) ? 1 : 0; |
| |
| /* Note: this uses the hardware stage, not the software stage */ |
| hk_device_alloc_scratch(dev, s->b.info.stage, max_scratch_size); |
| perf_debug(cmd, "Reserving %u (%u) bytes of scratch for stage %s", |
| s->b.info.scratch_size, s->b.info.preamble_scratch_size, |
| _mesa_shader_stage_to_abbrev(s->b.info.stage)); |
| |
| switch (s->b.info.stage) { |
| case PIPE_SHADER_FRAGMENT: |
| cs->scratch.fs.main = true; |
| cs->scratch.fs.preamble = MAX2(cs->scratch.fs.preamble, preamble_size); |
| break; |
| case PIPE_SHADER_VERTEX: |
| cs->scratch.vs.main = true; |
| cs->scratch.vs.preamble = MAX2(cs->scratch.vs.preamble, preamble_size); |
| break; |
| default: |
| cs->scratch.cs.main = true; |
| cs->scratch.cs.preamble = MAX2(cs->scratch.cs.preamble, preamble_size); |
| break; |
| } |
| } |
| |
| uint32_t |
| hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s, |
| struct hk_linked_shader *linked) |
| { |
| struct hk_device *dev = hk_cmd_buffer_device(cmd); |
| |
| enum pipe_shader_type sw_stage = s->info.stage; |
| |
| unsigned constant_push_ranges = DIV_ROUND_UP(s->b.info.rodata.size_16, 64); |
| unsigned push_ranges = 2; |
| unsigned stage_ranges = 3; |
| |
| size_t usc_size = |
| agx_usc_size(constant_push_ranges + push_ranges + stage_ranges + 4); |
| struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); |
| if (!t.cpu) |
| return 0; |
| |
| struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); |
| |
| uint64_t root_ptr; |
| |
| if (sw_stage == PIPE_SHADER_COMPUTE) |
| root_ptr = hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_COMPUTE); |
| else |
| root_ptr = cmd->state.gfx.root; |
| |
| static_assert(offsetof(struct hk_root_descriptor_table, root_desc_addr) == 0, |
| "self-reflective"); |
| |
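| /* The uniform slot for the root table pointer depends on the stage ABI: |
| * right after the per-draw vertex uniforms for vertex shaders, at a fixed |
| * slot for fragment shaders, and at u0 otherwise. |
| */ |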
| unsigned root_unif = 0; |
| if (sw_stage == MESA_SHADER_VERTEX) { |
| unsigned count = |
| DIV_ROUND_UP(BITSET_LAST_BIT(s->info.vs.attrib_components_read), 4); |
| |
| if (count) { |
| agx_usc_uniform( |
| &b, AGX_ABI_VUNI_VBO_BASE(0), 4 * count, |
| root_ptr + hk_root_descriptor_offset(draw.attrib_base)); |
| |
| agx_usc_uniform( |
| &b, AGX_ABI_VUNI_VBO_CLAMP(count, 0), 2 * count, |
| root_ptr + hk_root_descriptor_offset(draw.attrib_clamps)); |
| } |
| |
| if (cmd->state.gfx.draw_params) { |
| agx_usc_uniform(&b, AGX_ABI_VUNI_FIRST_VERTEX(count), 4, |
| cmd->state.gfx.draw_params); |
| } |
| |
| if (cmd->state.gfx.draw_id_ptr) { |
| agx_usc_uniform(&b, AGX_ABI_VUNI_DRAW_ID(count), 1, |
| cmd->state.gfx.draw_id_ptr); |
| } |
| |
| if (linked->sw_indexing) { |
| agx_usc_uniform( |
| &b, AGX_ABI_VUNI_INPUT_ASSEMBLY(count), 4, |
| root_ptr + hk_root_descriptor_offset(draw.input_assembly)); |
| } |
| |
| root_unif = AGX_ABI_VUNI_COUNT_VK(count); |
| } else if (sw_stage == MESA_SHADER_FRAGMENT) { |
| if (agx_tilebuffer_spills(&cmd->state.gfx.render.tilebuffer)) { |
| hk_usc_upload_spilled_rt_descs(&b, cmd); |
| } |
| |
| if (cmd->state.gfx.uses_blend_constant) { |
| agx_usc_uniform( |
| &b, 4, 8, |
| root_ptr + hk_root_descriptor_offset(draw.blend_constant)); |
| } |
| |
| /* The SHARED state is baked into linked->usc for non-fragment shaders. We |
| * don't pass around the information to bake the tilebuffer layout. |
| * |
| * TODO: We probably could with some refactor. |
| */ |
| agx_usc_push_packed(&b, SHARED, &cmd->state.gfx.render.tilebuffer.usc); |
| root_unif = AGX_ABI_FUNI_ROOT; |
| } |
| |
| agx_usc_uniform(&b, root_unif, 4, root_ptr); |
| |
| agx_usc_push_blob(&b, linked->usc.data, linked->usc.size); |
| return agx_usc_addr(&dev->dev, t.gpu); |
| } |
| |
| void |
| hk_dispatch_precomp(struct hk_cmd_buffer *cmd, struct agx_grid grid, |
| enum agx_barrier barrier, enum libagx_program idx, |
| void *data, size_t data_size) |
| { |
| struct hk_device *dev = hk_cmd_buffer_device(cmd); |
| struct agx_precompiled_shader *prog = agx_get_precompiled(&dev->bg_eot, idx); |
| |
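| /* Route the dispatch to the pre-graphics, post-graphics, or standalone |
| * compute control stream depending on the requested barrier. |
| */ |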
| struct hk_cs **target = (barrier & AGX_POSTGFX) ? &cmd->current_cs.post_gfx |
| : (barrier & AGX_PREGFX) ? &cmd->current_cs.pre_gfx |
| : &cmd->current_cs.cs; |
| |
| struct hk_cs *cs = hk_cmd_buffer_get_cs_general(cmd, target, true); |
| if (!cs) |
| return; |
| |
| struct agx_ptr t = hk_pool_usc_alloc(cmd, agx_usc_size(15), 64); |
| uint64_t uploaded_data = hk_pool_upload(cmd, data, data_size, 4); |
| |
| agx_usc_words_precomp(t.cpu, &prog->b, uploaded_data, data_size); |
| |
| hk_dispatch_with_usc_launch(dev, cs, prog->b.launch, |
| agx_usc_addr(&dev->dev, t.gpu), grid, |
| prog->b.workgroup); |
| } |
| |
| void |
| hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs) |
| { |
| struct hk_rendering_state *render = &cmd->state.gfx.render; |
| uint8_t *map = cs->current; |
| |
| cs->tib = render->tilebuffer; |
| |
| /* Assume this is not the first control stream of the render pass, so |
| * initially use the partial background/EOT program and ZLS control. |
| * hk_BeginRendering/hk_EndRendering will override. |
| */ |
| cs->cr = render->cr; |
| cs->cr.bg.main = render->cr.bg.partial; |
| cs->cr.eot.main = render->cr.eot.partial; |
| cs->cr.zls_control = render->cr.zls_control_partial; |
| |
| /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back |
| * with another that caused stale data to be cached and the CPU wrote to it |
| * in the meantime. |
| */ |
| agx_push(map, VDM_BARRIER, cfg) { |
| cfg.usc_cache_inval = true; |
| } |
| |
| struct AGX_PPP_HEADER present = { |
| .w_clamp = true, |
| .occlusion_query_2 = true, |
| .output_unknown = true, |
| .varying_word_2 = true, |
| .viewport_count = 1, /* irrelevant */ |
| }; |
| |
| size_t size = agx_ppp_update_size(&present); |
| struct agx_ptr T = hk_pool_alloc(cmd, size, 64); |
| if (!T.cpu) |
| return; |
| |
| struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present); |
| |
| /* clang-format off */ |
| agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10; |
| agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg); |
| agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg); |
| agx_ppp_push(&ppp, VARYING_2, cfg); |
| /* clang-format on */ |
| |
| agx_ppp_fini(&map, &ppp); |
| cs->current = map; |
| |
| util_dynarray_init(&cs->scissor, NULL); |
| util_dynarray_init(&cs->depth_bias, NULL); |
| |
| /* All graphics state must be re-emitted in each control stream. */ |
| hk_cmd_buffer_dirty_all(cmd); |
| } |
| |
| void |
| hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs, |
| size_t space) |
| { |
| bool vdm = cs->type == HK_CS_VDM; |
| |
| size_t link_length = |
| vdm ? AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH; |
| |
| /* Assert that we have space for a link tag */ |
| assert((cs->current + link_length) <= cs->end && "Encoder overflowed"); |
| |
| /* Always leave room for a link tag, in case we run out of space later, |
| * plus padding because VDM apparently overreads? |
| * |
| * 0x200 is not enough. 0x400 seems to work. 0x800 for safety. |
| */ |
| space += link_length + 0x800; |
| |
| /* If there is room in the command buffer, we're done */ |
| if (likely((cs->end - cs->current) >= space)) |
| return; |
| |
| /* Otherwise, allocate a fresh chunk for the control stream. We use memory |
| * owned by the command buffer to simplify lifetime management for the BO. |
| */ |
| size_t size = 65536; |
| struct agx_ptr T = hk_pool_alloc(cmd, size, 256); |
| |
| /* Jump from the old control stream to the new control stream */ |
| agx_cs_jump(cs->current, T.gpu, vdm); |
| |
| /* Swap out the control stream */ |
| cs->current = T.cpu; |
| cs->end = cs->current + size; |
| cs->chunk = T; |
| cs->stream_linked = true; |
| } |
| |
| static void |
| clear_attachment_as_image(struct hk_cmd_buffer *cmd, |
| struct hk_rendering_state *render, |
| struct hk_attachment *att, unsigned aspect) |
| { |
| struct hk_image_view *view = att->iview; |
| if (!att->clear || !view || !(view->vk.aspects & aspect)) |
| return; |
| |
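| /* With multiview, clear every layer up to the highest enabled view; |
| * otherwise clear all layers being rendered. |
| */ |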
| const uint32_t layer_count = render->view_mask |
| ? util_last_bit(render->view_mask) |
| : render->layer_count; |
| |
| const VkImageSubresourceRange range = { |
| .layerCount = layer_count, |
| .levelCount = 1, |
| .baseArrayLayer = view->vk.base_array_layer, |
| .baseMipLevel = view->vk.base_mip_level, |
| .aspectMask = view->vk.aspects & aspect, |
| }; |
| |
| assert(util_bitcount(range.aspectMask) == 1); |
| VkFormat format = att->vk_format; |
| |
| if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { |
| format = vk_format_depth_only(format); |
| } else if (aspect == VK_IMAGE_ASPECT_STENCIL_BIT) { |
| format = vk_format_stencil_only(format); |
| } |
| |
| struct hk_image *image = container_of(view->vk.image, struct hk_image, vk); |
| hk_clear_image(cmd, image, vk_format_to_pipe_format(format), |
| att->clear_colour, &range, false /* partially clear 3D */); |
| } |
| |
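| /* If the render pass's VDM control stream ended up with no real work, the |
| * attachment clears can be performed as image clears instead and the |
| * control stream dropped entirely. |
| */ |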
| void |
| hk_optimize_empty_vdm(struct hk_cmd_buffer *cmd) |
| { |
| struct hk_cs *cs = cmd->current_cs.gfx; |
| struct hk_rendering_state *render = &cmd->state.gfx.render; |
| |
| for (unsigned i = 0; i < render->color_att_count; ++i) { |
| clear_attachment_as_image(cmd, render, &render->color_att[i], ~0); |
| } |
| |
| clear_attachment_as_image(cmd, render, &render->depth_att, |
| VK_IMAGE_ASPECT_DEPTH_BIT); |
| |
| clear_attachment_as_image(cmd, render, &render->stencil_att, |
| VK_IMAGE_ASPECT_STENCIL_BIT); |
| |
| /* Remove the VDM control stream from the command buffer, now that it is |
| * replaced by equivalent other operations. |
| */ |
| list_del(&cs->node); |
| hk_cs_destroy(cs); |
| } |