| /* |
| * Copyright © 2022 Imagination Technologies Ltd. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #include <assert.h> |
| #include <limits.h> |
| #include <stdbool.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <string.h> |
| #include <vulkan/vulkan.h> |
| |
| #include "hwdef/rogue_hw_defs.h" |
| #include "hwdef/rogue_hw_utils.h" |
| #include "pvr_blit.h" |
| #include "pvr_bo.h" |
| #include "pvr_clear.h" |
| #include "pvr_common.h" |
| #include "pvr_csb.h" |
| #include "pvr_csb_enum_helpers.h" |
| #include "pvr_device_info.h" |
| #include "pvr_formats.h" |
| #include "pvr_hardcode.h" |
| #include "pvr_hw_pass.h" |
| #include "pvr_job_common.h" |
| #include "pvr_job_render.h" |
| #include "pvr_limits.h" |
| #include "pvr_pds.h" |
| #include "pvr_private.h" |
| #include "pvr_tex_state.h" |
| #include "pvr_types.h" |
| #include "usc/pvr_uscgen.h" |
| #include "pvr_winsys.h" |
| #include "util/bitscan.h" |
| #include "util/bitset.h" |
| #include "util/compiler.h" |
| #include "util/list.h" |
| #include "util/macros.h" |
| #include "util/u_dynarray.h" |
| #include "util/u_math.h" |
| #include "util/u_pack_color.h" |
| #include "vk_alloc.h" |
| #include "vk_command_buffer.h" |
| #include "vk_command_pool.h" |
| #include "vk_common_entrypoints.h" |
| #include "vk_format.h" |
| #include "vk_graphics_state.h" |
| #include "vk_log.h" |
| #include "vk_object.h" |
| #include "vk_pipeline_layout.h" |
| #include "vk_util.h" |
| |
/* Structure used to pass data into the pvr_compute_generate_control_stream()
 * function.
 */
| struct pvr_compute_kernel_info { |
| pvr_dev_addr_t indirect_buffer_addr; |
| bool global_offsets_present; |
| uint32_t usc_common_size; |
| uint32_t usc_unified_size; |
| uint32_t pds_temp_size; |
| uint32_t pds_data_size; |
| enum ROGUE_CDMCTRL_USC_TARGET usc_target; |
| bool is_fence; |
| uint32_t pds_data_offset; |
| uint32_t pds_code_offset; |
| enum ROGUE_CDMCTRL_SD_TYPE sd_type; |
| bool usc_common_shared; |
| uint32_t global_size[PVR_WORKGROUP_DIMENSIONS]; |
| uint32_t local_size[PVR_WORKGROUP_DIMENSIONS]; |
| uint32_t max_instances; |
| }; |
| |
| static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd *sub_cmd) |
| { |
| if (sub_cmd->owned) { |
| switch (sub_cmd->type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: |
| util_dynarray_fini(&sub_cmd->gfx.sec_query_indices); |
| pvr_csb_finish(&sub_cmd->gfx.control_stream); |
| pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.terminate_ctrl_stream); |
| pvr_bo_suballoc_free(sub_cmd->gfx.depth_bias_bo); |
| pvr_bo_suballoc_free(sub_cmd->gfx.scissor_bo); |
| break; |
| |
| case PVR_SUB_CMD_TYPE_COMPUTE: |
| case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY: |
| pvr_csb_finish(&sub_cmd->compute.control_stream); |
| break; |
| |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
| list_for_each_entry_safe (struct pvr_transfer_cmd, |
| transfer_cmd, |
| sub_cmd->transfer.transfer_cmds, |
| link) { |
| list_del(&transfer_cmd->link); |
| if (!transfer_cmd->is_deferred_clear) |
| vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd); |
| } |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| if (sub_cmd->event.type == PVR_EVENT_TYPE_WAIT) |
| vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd->event.wait.events); |
| break; |
| |
| default: |
| unreachable("Unsupported sub-command type"); |
| } |
| } |
| |
| list_del(&sub_cmd->link); |
| vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd); |
| } |
| |
| static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| list_for_each_entry_safe (struct pvr_sub_cmd, |
| sub_cmd, |
| &cmd_buffer->sub_cmds, |
| link) { |
| pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd); |
| } |
| } |
| |
| static void pvr_cmd_buffer_free_resources(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| vk_free(&cmd_buffer->vk.pool->alloc, |
| cmd_buffer->state.render_pass_info.attachments); |
| vk_free(&cmd_buffer->vk.pool->alloc, |
| cmd_buffer->state.render_pass_info.clear_values); |
| |
| util_dynarray_fini(&cmd_buffer->state.query_indices); |
| |
| pvr_cmd_buffer_free_sub_cmds(cmd_buffer); |
| |
| list_for_each_entry_safe (struct pvr_suballoc_bo, |
| suballoc_bo, |
| &cmd_buffer->bo_list, |
| link) { |
| list_del(&suballoc_bo->link); |
| pvr_bo_suballoc_free(suballoc_bo); |
| } |
| |
| util_dynarray_fini(&cmd_buffer->deferred_clears); |
| util_dynarray_fini(&cmd_buffer->deferred_csb_commands); |
| util_dynarray_fini(&cmd_buffer->scissor_array); |
| util_dynarray_fini(&cmd_buffer->depth_bias_array); |
| } |
| |
| static void pvr_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer, |
| VkCommandBufferResetFlags flags) |
| { |
| struct pvr_cmd_buffer *cmd_buffer = |
| container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk); |
| |
| /* FIXME: For now we always free all resources as if |
| * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set. |
| */ |
| pvr_cmd_buffer_free_resources(cmd_buffer); |
| |
| vk_command_buffer_reset(&cmd_buffer->vk); |
| |
| memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); |
| memset(&cmd_buffer->scissor_words, 0, sizeof(cmd_buffer->scissor_words)); |
| |
| cmd_buffer->usage_flags = 0; |
| } |
| |
| static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) |
| { |
| struct pvr_cmd_buffer *cmd_buffer = |
| container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk); |
| |
| pvr_cmd_buffer_free_resources(cmd_buffer); |
| vk_command_buffer_finish(&cmd_buffer->vk); |
| vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer); |
| } |
| |
| static const struct vk_command_buffer_ops cmd_buffer_ops = { |
| .reset = pvr_cmd_buffer_reset, |
| .destroy = pvr_cmd_buffer_destroy, |
| }; |
| |
| static VkResult pvr_cmd_buffer_create(struct pvr_device *device, |
| struct vk_command_pool *pool, |
| VkCommandBufferLevel level, |
| VkCommandBuffer *pCommandBuffer) |
| { |
| struct pvr_cmd_buffer *cmd_buffer; |
| VkResult result; |
| |
| cmd_buffer = vk_zalloc(&pool->alloc, |
| sizeof(*cmd_buffer), |
| 8U, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!cmd_buffer) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| result = |
| vk_command_buffer_init(pool, &cmd_buffer->vk, &cmd_buffer_ops, level); |
| if (result != VK_SUCCESS) { |
| vk_free(&pool->alloc, cmd_buffer); |
| return result; |
| } |
| |
| cmd_buffer->device = device; |
| |
| util_dynarray_init(&cmd_buffer->depth_bias_array, NULL); |
| util_dynarray_init(&cmd_buffer->scissor_array, NULL); |
| util_dynarray_init(&cmd_buffer->deferred_csb_commands, NULL); |
| util_dynarray_init(&cmd_buffer->deferred_clears, NULL); |
| |
| list_inithead(&cmd_buffer->sub_cmds); |
| list_inithead(&cmd_buffer->bo_list); |
| |
| *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer); |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult |
| pvr_AllocateCommandBuffers(VkDevice _device, |
| const VkCommandBufferAllocateInfo *pAllocateInfo, |
| VkCommandBuffer *pCommandBuffers) |
| { |
| VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool); |
| PVR_FROM_HANDLE(pvr_device, device, _device); |
| VkResult result = VK_SUCCESS; |
| uint32_t i; |
| |
| for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { |
| result = pvr_cmd_buffer_create(device, |
| pool, |
| pAllocateInfo->level, |
| &pCommandBuffers[i]); |
| if (result != VK_SUCCESS) |
| break; |
| } |
| |
| if (result != VK_SUCCESS) { |
| while (i--) { |
| VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]); |
| pvr_cmd_buffer_destroy(cmd_buffer); |
| } |
| |
| for (i = 0; i < pAllocateInfo->commandBufferCount; i++) |
| pCommandBuffers[i] = VK_NULL_HANDLE; |
| } |
| |
| return result; |
| } |
| |
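/* Mark the pipeline stages exercised by a sub-command of the given type as
 * having outstanding work, so that later barriers know they must wait on it.
 */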
| static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer, |
| enum pvr_sub_cmd_type type) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| uint32_t barriers; |
| |
| switch (type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: |
| barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_COMPUTE: |
| barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY: |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
      /* Compute jobs are used for occlusion queries, but to copy the results
       * we have to sync with transfer jobs because vkCmdCopyQueryPoolResults()
       * is deemed a transfer operation by the spec.
       */
| barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| barriers = 0; |
| break; |
| |
| default: |
| unreachable("Unsupported sub-command type"); |
| } |
| |
| for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++) |
| state->barriers_needed[i] |= barriers; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_upload_tables(struct pvr_device *device, |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| const uint32_t cache_line_size = |
| rogue_get_slc_cache_line_size(&device->pdevice->dev_info); |
| VkResult result; |
| |
| assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo); |
| |
| if (cmd_buffer->depth_bias_array.size > 0) { |
| result = |
| pvr_gpu_upload(device, |
| device->heaps.general_heap, |
| util_dynarray_begin(&cmd_buffer->depth_bias_array), |
| cmd_buffer->depth_bias_array.size, |
| cache_line_size, |
| &sub_cmd->depth_bias_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| |
| if (cmd_buffer->scissor_array.size > 0) { |
| result = pvr_gpu_upload(device, |
| device->heaps.general_heap, |
| util_dynarray_begin(&cmd_buffer->scissor_array), |
| cmd_buffer->scissor_array.size, |
| cache_line_size, |
| &sub_cmd->scissor_bo); |
| if (result != VK_SUCCESS) |
| goto err_free_depth_bias_bo; |
| } |
| |
| util_dynarray_clear(&cmd_buffer->depth_bias_array); |
| util_dynarray_clear(&cmd_buffer->scissor_array); |
| |
| return VK_SUCCESS; |
| |
| err_free_depth_bias_bo: |
| pvr_bo_suballoc_free(sub_cmd->depth_bias_bo); |
| sub_cmd->depth_bias_bo = NULL; |
| |
| return result; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_emit_ppp_state(const struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_csb *const csb) |
| { |
| const struct pvr_framebuffer *const framebuffer = |
| cmd_buffer->state.render_pass_info.framebuffer; |
| |
| assert(csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS || |
| csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED); |
| |
| pvr_csb_set_relocation_mark(csb); |
| |
| pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state0) { |
| state0.addrmsb = framebuffer->ppp_state_bo->dev_addr; |
| state0.word_count = framebuffer->ppp_state_size; |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state1) { |
| state1.addrlsb = framebuffer->ppp_state_bo->dev_addr; |
| } |
| |
| pvr_csb_clear_relocation_mark(csb); |
| |
| return csb->status; |
| } |
| |
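/* Upload arbitrary data to the device's general heap and track the resulting
 * suballocation in the command buffer's BO list so that it is freed when the
 * command buffer is reset or destroyed.
 */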
| VkResult |
| pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer, |
| const void *const data, |
| const size_t size, |
| struct pvr_suballoc_bo **const pvr_bo_out) |
| { |
| struct pvr_device *const device = cmd_buffer->device; |
| const uint32_t cache_line_size = |
| rogue_get_slc_cache_line_size(&device->pdevice->dev_info); |
| struct pvr_suballoc_bo *suballoc_bo; |
| VkResult result; |
| |
| result = pvr_gpu_upload(device, |
| device->heaps.general_heap, |
| data, |
| size, |
| cache_line_size, |
| &suballoc_bo); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| list_add(&suballoc_bo->link, &cmd_buffer->bo_list); |
| |
| *pvr_bo_out = suballoc_bo; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer, |
| const void *const code, |
| const size_t code_size, |
| uint64_t code_alignment, |
| struct pvr_suballoc_bo **const pvr_bo_out) |
| { |
| struct pvr_device *const device = cmd_buffer->device; |
| const uint32_t cache_line_size = |
| rogue_get_slc_cache_line_size(&device->pdevice->dev_info); |
| struct pvr_suballoc_bo *suballoc_bo; |
| VkResult result; |
| |
| code_alignment = MAX2(code_alignment, cache_line_size); |
| |
| result = |
| pvr_gpu_upload_usc(device, code, code_size, code_alignment, &suballoc_bo); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| list_add(&suballoc_bo->link, &cmd_buffer->bo_list); |
| |
| *pvr_bo_out = suballoc_bo; |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer, |
| const uint32_t *data, |
| uint32_t data_size_dwords, |
| uint32_t data_alignment, |
| const uint32_t *code, |
| uint32_t code_size_dwords, |
| uint32_t code_alignment, |
| uint64_t min_alignment, |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| struct pvr_device *const device = cmd_buffer->device; |
| VkResult result; |
| |
| result = pvr_gpu_upload_pds(device, |
| data, |
| data_size_dwords, |
| data_alignment, |
| code, |
| code_size_dwords, |
| code_alignment, |
| min_alignment, |
| pds_upload_out); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list); |
| |
| return VK_SUCCESS; |
| } |
| |
| static inline VkResult |
| pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer, |
| const uint32_t *data, |
| uint32_t data_size_dwords, |
| uint32_t data_alignment, |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| return pvr_cmd_buffer_upload_pds(cmd_buffer, |
| data, |
| data_size_dwords, |
| data_alignment, |
| NULL, |
| 0, |
| 0, |
| data_alignment, |
| pds_upload_out); |
| } |
| |
/* pbe_cs_words must be an array of emit_count entries, each containing
 * ROGUE_NUM_PBESTATE_STATE_WORDS words.
 */
| static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload( |
| struct pvr_cmd_buffer *const cmd_buffer, |
| const uint32_t emit_count, |
| const uint32_t *pbe_cs_words, |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| struct pvr_pds_event_program pixel_event_program = { |
| /* No data to DMA, just a DOUTU needed. */ |
| .num_emit_word_pairs = 0, |
| }; |
| const uint32_t staging_buffer_size = |
| PVR_DW_TO_BYTES(cmd_buffer->device->pixel_event_data_size_in_dwords); |
| const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc; |
| struct pvr_device *const device = cmd_buffer->device; |
| struct pvr_suballoc_bo *usc_eot_program = NULL; |
| struct util_dynarray eot_program_bin; |
| uint32_t *staging_buffer; |
| uint32_t usc_temp_count; |
| VkResult result; |
| |
| assert(emit_count > 0); |
| |
| pvr_uscgen_eot("per-job EOT", |
| emit_count, |
| pbe_cs_words, |
| &usc_temp_count, |
| &eot_program_bin); |
| |
| result = pvr_cmd_buffer_upload_usc(cmd_buffer, |
| eot_program_bin.data, |
| eot_program_bin.size, |
| 4, |
| &usc_eot_program); |
| |
| util_dynarray_fini(&eot_program_bin); |
| |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pvr_pds_setup_doutu(&pixel_event_program.task_control, |
| usc_eot_program->dev_addr.addr, |
| usc_temp_count, |
| ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE, |
| false); |
| |
   /* TODO: We could skip allocating this and generate directly into the device
    * buffer, removing one allocation and memcpy() per job. Would this speed
    * things up in a noticeable way?
    */
| staging_buffer = vk_alloc(allocator, |
| staging_buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!staging_buffer) { |
| result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| goto err_free_usc_pixel_program; |
| } |
| |
| /* Generate the data segment. The code segment was uploaded earlier when |
| * setting up the PDS static heap data. |
| */ |
| pvr_pds_generate_pixel_event_data_segment(&pixel_event_program, |
| staging_buffer, |
| &device->pdevice->dev_info); |
| |
| result = pvr_cmd_buffer_upload_pds_data( |
| cmd_buffer, |
| staging_buffer, |
| cmd_buffer->device->pixel_event_data_size_in_dwords, |
| 4, |
| pds_upload_out); |
| if (result != VK_SUCCESS) |
| goto err_free_pixel_event_staging_buffer; |
| |
| vk_free(allocator, staging_buffer); |
| |
| return VK_SUCCESS; |
| |
| err_free_pixel_event_staging_buffer: |
| vk_free(allocator, staging_buffer); |
| |
| err_free_usc_pixel_program: |
| list_del(&usc_eot_program->link); |
| pvr_bo_suballoc_free(usc_eot_program); |
| |
| return result; |
| } |
| |
| static VkResult pvr_sub_cmd_gfx_build_terminate_ctrl_stream( |
| struct pvr_device *const device, |
| const struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const gfx_sub_cmd) |
| { |
| struct list_head bo_list; |
| struct pvr_csb csb; |
| VkResult result; |
| |
| pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb); |
| |
| result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, &csb); |
| if (result != VK_SUCCESS) |
| goto err_csb_finish; |
| |
| result = pvr_csb_emit_terminate(&csb); |
| if (result != VK_SUCCESS) |
| goto err_csb_finish; |
| |
| result = pvr_csb_bake(&csb, &bo_list); |
| if (result != VK_SUCCESS) |
| goto err_csb_finish; |
| |
   /* This is a trivial control stream; there's no reason it should ever
    * require more memory than a single bo can provide.
    */
| assert(list_is_singular(&bo_list)); |
| gfx_sub_cmd->terminate_ctrl_stream = |
| list_first_entry(&bo_list, struct pvr_bo, link); |
| |
| return VK_SUCCESS; |
| |
| err_csb_finish: |
| pvr_csb_finish(&csb); |
| |
| return result; |
| } |
| |
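/* Build the combined image + sampler state used by load ops to read back an
 * attachment: point filtering, clamp-to-edge addressing and non-normalized
 * coordinates.
 */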
| static VkResult pvr_setup_texture_state_words( |
| struct pvr_device *device, |
| struct pvr_combined_image_sampler_descriptor *descriptor, |
| const struct pvr_image_view *image_view) |
| { |
| const struct pvr_image *image = vk_to_pvr_image(image_view->vk.image); |
| struct pvr_texture_state_info info = { |
| .format = image_view->vk.format, |
| .mem_layout = image->memlayout, |
| .type = image_view->vk.view_type, |
| .is_cube = image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE || |
| image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY, |
| .tex_state_type = PVR_TEXTURE_STATE_SAMPLE, |
| .extent = image_view->vk.extent, |
| .mip_levels = 1, |
| .sample_count = image_view->vk.image->samples, |
| .stride = image->physical_extent.width, |
| .addr = image->dev_addr, |
| }; |
| const uint8_t *const swizzle = pvr_get_format_swizzle(info.format); |
| VkResult result; |
| |
| memcpy(&info.swizzle, swizzle, sizeof(info.swizzle)); |
| |
| /* TODO: Can we use image_view->texture_state instead of generating here? */ |
| result = pvr_pack_tex_state(device, &info, &descriptor->image); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pvr_csb_pack (&descriptor->sampler.words[0], |
| TEXSTATE_SAMPLER_WORD0, |
| sampler) { |
| sampler.non_normalized_coords = true; |
| sampler.addrmode_v = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE; |
| sampler.addrmode_u = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE; |
| sampler.minfilter = ROGUE_TEXSTATE_FILTER_POINT; |
| sampler.magfilter = ROGUE_TEXSTATE_FILTER_POINT; |
| sampler.dadjust = ROGUE_TEXSTATE_DADJUST_ZERO_UINT; |
| } |
| |
| pvr_csb_pack (&descriptor->sampler.words[1], |
| TEXSTATE_SAMPLER_WORD1, |
| sampler) { |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_load_op *load_op, |
| pvr_dev_addr_t *const addr_out) |
| { |
| const struct pvr_render_pass_info *render_pass_info = |
| &cmd_buffer->state.render_pass_info; |
| const struct pvr_render_pass *pass = render_pass_info->pass; |
| const struct pvr_renderpass_hwsetup_render *hw_render = load_op->hw_render; |
| const struct pvr_renderpass_colorinit *color_init = |
| &hw_render->color_init[0]; |
| const VkClearValue *clear_value = |
| &render_pass_info->clear_values[color_init->index]; |
| struct pvr_suballoc_bo *clear_bo; |
| uint32_t attachment_count; |
| bool has_depth_clear; |
| bool has_depth_load; |
| VkResult result; |
| |
   /* These are only set up and never used for now. They will need to be
    * uploaded into a buffer based on some compiler info.
    */
   /* TODO: Remove the above comment once the compiler is hooked up and we're
    * setting up + uploading the buffer.
    */
| struct pvr_combined_image_sampler_descriptor |
| texture_states[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS]; |
| uint32_t texture_count = 0; |
| uint32_t hw_clear_value[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS * |
| PVR_CLEAR_COLOR_ARRAY_SIZE]; |
| uint32_t next_clear_consts = 0; |
| |
| if (load_op->is_hw_object) |
| attachment_count = load_op->hw_render->color_init_count; |
| else |
| attachment_count = load_op->subpass->color_count; |
| |
| for (uint32_t i = 0; i < attachment_count; i++) { |
| struct pvr_image_view *image_view; |
| uint32_t attachment_idx; |
| |
| if (load_op->is_hw_object) |
| attachment_idx = load_op->hw_render->color_init[i].index; |
| else |
| attachment_idx = load_op->subpass->color_attachments[i]; |
| |
| image_view = render_pass_info->attachments[attachment_idx]; |
| |
| assert((load_op->clears_loads_state.rt_load_mask & |
| load_op->clears_loads_state.rt_clear_mask) == 0); |
| if (load_op->clears_loads_state.rt_load_mask & BITFIELD_BIT(i)) { |
| result = pvr_setup_texture_state_words(cmd_buffer->device, |
| &texture_states[texture_count], |
| image_view); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| texture_count++; |
| } else if (load_op->clears_loads_state.rt_clear_mask & BITFIELD_BIT(i)) { |
| const uint32_t accum_fmt_size = |
| pvr_get_pbe_accum_format_size_in_bytes(image_view->vk.format); |
| |
| assert(next_clear_consts + |
| vk_format_get_blocksize(image_view->vk.format) <= |
| ARRAY_SIZE(hw_clear_value)); |
| |
| /* FIXME: do this at the point we store the clear values? */ |
| pvr_get_hw_clear_color(image_view->vk.format, |
| clear_value->color, |
| &hw_clear_value[next_clear_consts]); |
| |
| next_clear_consts += DIV_ROUND_UP(accum_fmt_size, sizeof(uint32_t)); |
| } |
| } |
| |
| has_depth_load = false; |
| for (uint32_t i = 0; |
| i < ARRAY_SIZE(load_op->clears_loads_state.dest_vk_format); |
| i++) { |
| if (load_op->clears_loads_state.dest_vk_format[i] == |
| VK_FORMAT_D32_SFLOAT) { |
| has_depth_load = true; |
| break; |
| } |
| } |
| |
| has_depth_clear = load_op->clears_loads_state.depth_clear_to_reg != -1; |
| |
| assert(!(has_depth_clear && has_depth_load)); |
| |
| if (has_depth_load) { |
| const struct pvr_render_pass_attachment *attachment; |
| const struct pvr_image_view *image_view; |
| |
| assert(load_op->subpass->depth_stencil_attachment != |
| VK_ATTACHMENT_UNUSED); |
| assert(!load_op->is_hw_object); |
| attachment = |
| &pass->attachments[load_op->subpass->depth_stencil_attachment]; |
| |
| image_view = render_pass_info->attachments[attachment->index]; |
| |
| result = pvr_setup_texture_state_words(cmd_buffer->device, |
| &texture_states[texture_count], |
| image_view); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| texture_count++; |
| } else if (has_depth_clear) { |
| const struct pvr_render_pass_attachment *attachment; |
| VkClearValue clear_value; |
| |
| assert(load_op->subpass->depth_stencil_attachment != |
| VK_ATTACHMENT_UNUSED); |
| attachment = |
| &pass->attachments[load_op->subpass->depth_stencil_attachment]; |
| |
| clear_value = render_pass_info->clear_values[attachment->index]; |
| |
| assert(next_clear_consts < ARRAY_SIZE(hw_clear_value)); |
| hw_clear_value[next_clear_consts++] = fui(clear_value.depthStencil.depth); |
| } |
| |
| result = pvr_cmd_buffer_upload_general(cmd_buffer, |
| &hw_clear_value[0], |
| sizeof(hw_clear_value), |
| &clear_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| *addr_out = clear_bo->dev_addr; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult pvr_load_op_pds_data_create_and_upload( |
| struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_load_op *load_op, |
| pvr_dev_addr_t constants_addr, |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| struct pvr_device *device = cmd_buffer->device; |
| const struct pvr_device_info *dev_info = &device->pdevice->dev_info; |
| struct pvr_pds_pixel_shader_sa_program program = { 0 }; |
| uint32_t staging_buffer_size; |
| uint32_t *staging_buffer; |
| VkResult result; |
| |
| program.num_texture_dma_kicks = 1; |
| |
| pvr_csb_pack (&program.texture_dma_address[0], |
| PDSINST_DOUT_FIELDS_DOUTD_SRC0, |
| value) { |
| value.sbase = constants_addr; |
| } |
| |
| pvr_csb_pack (&program.texture_dma_control[0], |
| PDSINST_DOUT_FIELDS_DOUTD_SRC1, |
| value) { |
| value.dest = ROGUE_PDSINST_DOUTD_DEST_COMMON_STORE; |
| value.a0 = load_op->shareds_dest_offset; |
| value.bsize = load_op->shareds_count; |
| } |
| |
| pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info); |
| |
| staging_buffer_size = PVR_DW_TO_BYTES(program.data_size); |
| |
| staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc, |
| staging_buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!staging_buffer) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| pvr_pds_generate_pixel_shader_sa_texture_state_data(&program, |
| staging_buffer, |
| dev_info); |
| |
| result = pvr_cmd_buffer_upload_pds_data(cmd_buffer, |
| staging_buffer, |
| program.data_size, |
| 1, |
| pds_upload_out); |
| if (result != VK_SUCCESS) { |
| vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer); |
| return result; |
| } |
| |
| vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer); |
| |
| return VK_SUCCESS; |
| } |
| |
| /* FIXME: Should this function be specific to the HW background object, in |
| * which case its name should be changed, or should it have the load op |
| * structure passed in? |
| */ |
| static VkResult |
| pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_load_op *load_op, |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| pvr_dev_addr_t constants_addr; |
| VkResult result; |
| |
| result = pvr_load_op_constants_create_and_upload(cmd_buffer, |
| load_op, |
| &constants_addr); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| return pvr_load_op_pds_data_create_and_upload(cmd_buffer, |
| load_op, |
| constants_addr, |
| pds_upload_out); |
| } |
| |
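/* Pack the CR_PDS_BGRND register values that point the background object at
 * its PDS fragment and texture-state programs and describe the shared,
 * texture-state and temporary register sizes it needs.
 */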
| static void pvr_pds_bgnd_pack_state( |
| const struct pvr_load_op *load_op, |
| const struct pvr_pds_upload *load_op_program, |
| uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS]) |
| { |
| pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) { |
| value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset); |
| value.texunicode_addr = |
| PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset); |
| } |
| |
| pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) { |
| value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset); |
| } |
| |
| pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) { |
| value.usc_sharedsize = |
| DIV_ROUND_UP(load_op->const_shareds_count, |
| ROGUE_CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE); |
| value.pds_texturestatesize = DIV_ROUND_UP( |
| load_op_program->data_size, |
| ROGUE_CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE); |
| value.pds_tempsize = |
| DIV_ROUND_UP(load_op->temps_count, |
| ROGUE_CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE); |
| } |
| } |
| |
| /** |
| * \brief Calculates the stride in pixels based on the pitch in bytes and pixel |
| * format. |
| * |
| * \param[in] pitch Width pitch in bytes. |
| * \param[in] vk_format Vulkan image format. |
| * \return Stride in pixels. |
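 *
 * For example (illustrative values): a pitch of 1024 bytes with a 4-byte
 * format such as VK_FORMAT_R8G8B8A8_UNORM gives a stride of 256 pixels.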
| */ |
| static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format) |
| { |
| const unsigned int cpp = vk_format_get_blocksize(vk_format); |
| |
| assert(pitch % cpp == 0); |
| |
| return pitch / cpp; |
| } |
| |
| static void pvr_setup_pbe_state( |
| const struct pvr_device_info *dev_info, |
| const struct pvr_framebuffer *framebuffer, |
| uint32_t mrt_index, |
| const struct usc_mrt_resource *mrt_resource, |
| const struct pvr_image_view *const iview, |
| const VkRect2D *render_area, |
| const bool down_scale, |
| const uint32_t samples, |
| uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS], |
| uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS]) |
| { |
| const struct pvr_image *image = pvr_image_view_get_image(iview); |
| uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch; |
| |
| struct pvr_pbe_surf_params surface_params; |
| struct pvr_pbe_render_params render_params; |
| bool with_packed_usc_channel; |
| const uint8_t *swizzle; |
| uint32_t position; |
| |
| /* down_scale should be true when performing a resolve, in which case there |
| * should be more than one sample. |
| */ |
| assert((down_scale && samples > 1U) || (!down_scale && samples == 1U)); |
| |
| /* Setup surface parameters. */ |
| |
| if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) { |
| with_packed_usc_channel = vk_format_is_unorm(iview->vk.format) || |
| vk_format_is_snorm(iview->vk.format); |
| } else { |
| with_packed_usc_channel = false; |
| } |
| |
| swizzle = pvr_get_format_swizzle(iview->vk.format); |
| memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle)); |
| |
| pvr_pbe_get_src_format_and_gamma(iview->vk.format, |
| PVR_PBE_GAMMA_NONE, |
| with_packed_usc_channel, |
| &surface_params.source_format, |
| &surface_params.gamma); |
| |
| surface_params.is_normalized = |
| pvr_vk_format_is_fully_normalized(iview->vk.format); |
| surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format); |
| surface_params.nr_components = vk_format_get_nr_components(iview->vk.format); |
| |
| /* FIXME: Should we have an inline function to return the address of a mip |
| * level? |
| */ |
| surface_params.addr = |
| PVR_DEV_ADDR_OFFSET(image->vma->dev_addr, |
| image->mip_levels[iview->vk.base_mip_level].offset); |
| surface_params.addr = |
| PVR_DEV_ADDR_OFFSET(surface_params.addr, |
| iview->vk.base_array_layer * image->layer_size); |
| |
| surface_params.mem_layout = image->memlayout; |
| surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format); |
| surface_params.depth = iview->vk.extent.depth; |
| surface_params.width = iview->vk.extent.width; |
| surface_params.height = iview->vk.extent.height; |
| surface_params.z_only_render = false; |
| surface_params.down_scale = down_scale; |
| |
| /* Setup render parameters. */ |
| |
| if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) { |
| position = mrt_resource->mem.offset_dw; |
| } else { |
| assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG); |
| assert(mrt_resource->reg.offset == 0); |
| |
| position = mrt_resource->reg.output_reg; |
| } |
| |
| assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers)); |
| |
| switch (position) { |
| case 0: |
| case 4: |
| render_params.source_start = PVR_PBE_STARTPOS_BIT0; |
| break; |
| case 1: |
| case 5: |
| render_params.source_start = PVR_PBE_STARTPOS_BIT32; |
| break; |
| case 2: |
| case 6: |
| render_params.source_start = PVR_PBE_STARTPOS_BIT64; |
| break; |
| case 3: |
| case 7: |
| render_params.source_start = PVR_PBE_STARTPOS_BIT96; |
| break; |
| default: |
| assert(!"Invalid output register"); |
| break; |
| } |
| |
| #define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v)-1 : 0) |
| |
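   /* Illustrative example: a render area at offset (0, 0) with a 512x512
    * extent on a framebuffer at least that large gives inclusive clip limits
    * of max_x_clip = max_y_clip = 511.
    */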
| render_params.min_x_clip = MAX2(0, render_area->offset.x); |
| render_params.min_y_clip = MAX2(0, render_area->offset.y); |
| render_params.max_x_clip = MIN2( |
| framebuffer->width - 1, |
| PVR_DEC_IF_NOT_ZERO(render_area->offset.x + render_area->extent.width)); |
| render_params.max_y_clip = MIN2( |
| framebuffer->height - 1, |
| PVR_DEC_IF_NOT_ZERO(render_area->offset.y + render_area->extent.height)); |
| |
| #undef PVR_DEC_IF_NOT_ZERO |
| |
| render_params.slice = 0; |
| render_params.mrt_index = mrt_index; |
| |
| pvr_pbe_pack_state(dev_info, |
| &surface_params, |
| &render_params, |
| pbe_cs_words, |
| pbe_reg_words); |
| } |
| |
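/* Framebuffer render targets are stored per supported sample count; select
 * the one matching this HW render, indexed by log2(sample_count).
 */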
| static struct pvr_render_target * |
| pvr_get_render_target(const struct pvr_render_pass *pass, |
| const struct pvr_framebuffer *framebuffer, |
| uint32_t idx) |
| { |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &pass->hw_setup->renders[idx]; |
| uint32_t rt_idx = 0; |
| |
| switch (hw_render->sample_count) { |
| case 1: |
| case 2: |
| case 4: |
| case 8: |
| rt_idx = util_logbase2(hw_render->sample_count); |
| break; |
| |
| default: |
| unreachable("Unsupported sample count"); |
| break; |
| } |
| |
| return &framebuffer->render_targets[rt_idx]; |
| } |
| |
| static uint32_t |
| pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass, |
| uint32_t idx, |
| const struct pvr_device_info *dev_info) |
| { |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &pass->hw_setup->renders[idx]; |
| /* Default value based on the maximum value found in all existing cores. The |
| * maximum is used as this is being treated as a lower bound, making it a |
| * "safer" choice than the minimum value found in all existing cores. |
| */ |
| const uint32_t min_output_regs = |
| PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U); |
| const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs); |
| |
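   /* Illustrative example: three output registers with a minimum of two round
    * up to a pixel output width of four.
    */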
| return util_next_power_of_two(width); |
| } |
| |
| static inline bool |
| pvr_ds_attachment_requires_zls(const struct pvr_ds_attachment *attachment) |
| { |
| bool zls_used; |
| |
| zls_used = attachment->load.d || attachment->load.s; |
| zls_used |= attachment->store.d || attachment->store.s; |
| |
| return zls_used; |
| } |
| |
| /** |
| * \brief If depth and/or stencil attachment dimensions are not tile-aligned, |
| * then we may need to insert some additional transfer subcommands. |
| * |
| * It's worth noting that we check whether the dimensions are smaller than a |
| * tile here, rather than checking whether they're tile-aligned - this relies |
| * on the assumption that we can safely use any attachment with dimensions |
| * larger than a tile. If the attachment is twiddled, it will be over-allocated |
| * to the nearest power-of-two (which will be tile-aligned). If the attachment |
| * is not twiddled, we don't need to worry about tile-alignment at all. |
| */ |
| static bool pvr_sub_cmd_gfx_requires_ds_subtile_alignment( |
| const struct pvr_device_info *dev_info, |
| const struct pvr_render_job *job) |
| { |
| const struct pvr_image *const ds_image = |
| pvr_image_view_get_image(job->ds.iview); |
| uint32_t zls_tile_size_x; |
| uint32_t zls_tile_size_y; |
| |
| rogue_get_zls_tile_size_xy(dev_info, &zls_tile_size_x, &zls_tile_size_y); |
| |
| if (ds_image->physical_extent.width >= zls_tile_size_x && |
| ds_image->physical_extent.height >= zls_tile_size_y) { |
| return false; |
| } |
| |
| /* If we have the zls_subtile feature, we can skip the alignment iff: |
| * - The attachment is not multisampled, and |
| * - The depth and stencil attachments are the same. |
| */ |
| if (PVR_HAS_FEATURE(dev_info, zls_subtile) && |
| ds_image->vk.samples == VK_SAMPLE_COUNT_1_BIT && |
| job->has_stencil_attachment == job->has_depth_attachment) { |
| return false; |
| } |
| |
| /* No ZLS functions enabled; nothing to do. */ |
| if ((!job->has_depth_attachment && !job->has_stencil_attachment) || |
| !pvr_ds_attachment_requires_zls(&job->ds)) { |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static VkResult |
| pvr_sub_cmd_gfx_align_ds_subtiles(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const gfx_sub_cmd) |
| { |
| struct pvr_sub_cmd *const prev_sub_cmd = |
| container_of(gfx_sub_cmd, struct pvr_sub_cmd, gfx); |
| struct pvr_ds_attachment *const ds = &gfx_sub_cmd->job.ds; |
| const struct pvr_image *const ds_image = pvr_image_view_get_image(ds->iview); |
| const VkFormat copy_format = pvr_get_raw_copy_format(ds_image->vk.format); |
| |
| struct pvr_suballoc_bo *buffer; |
| uint32_t buffer_layer_size; |
| VkBufferImageCopy2 region; |
| VkExtent2D zls_tile_size; |
| VkExtent2D rounded_size; |
| uint32_t buffer_size; |
| VkExtent2D scale; |
| VkResult result; |
| |
| /* The operations below assume the last command in the buffer was the target |
| * gfx subcommand. Assert that this is the case. |
| */ |
| assert(list_last_entry(&cmd_buffer->sub_cmds, struct pvr_sub_cmd, link) == |
| prev_sub_cmd); |
| assert(prev_sub_cmd == cmd_buffer->state.current_sub_cmd); |
| |
| if (!pvr_ds_attachment_requires_zls(ds)) |
| return VK_SUCCESS; |
| |
| rogue_get_zls_tile_size_xy(&cmd_buffer->device->pdevice->dev_info, |
| &zls_tile_size.width, |
| &zls_tile_size.height); |
| rogue_get_isp_scale_xy_from_samples(ds_image->vk.samples, |
| &scale.width, |
| &scale.height); |
| |
| rounded_size = (VkExtent2D){ |
| .width = ALIGN_POT(ds_image->physical_extent.width, zls_tile_size.width), |
| .height = |
| ALIGN_POT(ds_image->physical_extent.height, zls_tile_size.height), |
| }; |
| |
| buffer_layer_size = vk_format_get_blocksize(ds_image->vk.format) * |
| rounded_size.width * rounded_size.height * scale.width * |
| scale.height; |
| |
| if (ds->iview->vk.layer_count > 1) |
| buffer_layer_size = ALIGN_POT(buffer_layer_size, ds_image->alignment); |
| |
| buffer_size = buffer_layer_size * ds->iview->vk.layer_count; |
| |
| result = pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.general_heap, |
| buffer_size, |
| &buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| region = (VkBufferImageCopy2){ |
| .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2, |
| .pNext = NULL, |
| .bufferOffset = 0, |
| .bufferRowLength = rounded_size.width, |
| .bufferImageHeight = 0, |
| .imageSubresource = { |
| .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, |
| .mipLevel = ds->iview->vk.base_mip_level, |
| .baseArrayLayer = ds->iview->vk.base_array_layer, |
| .layerCount = ds->iview->vk.layer_count, |
| }, |
| .imageOffset = { 0 }, |
| .imageExtent = { |
| .width = ds->iview->vk.extent.width, |
| .height = ds->iview->vk.extent.height, |
| .depth = 1, |
| }, |
| }; |
| |
| if (ds->load.d || ds->load.s) { |
| struct pvr_sub_cmd *new_sub_cmd; |
| |
| cmd_buffer->state.current_sub_cmd = NULL; |
| |
| result = |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| new_sub_cmd = cmd_buffer->state.current_sub_cmd; |
| |
| result = pvr_copy_image_to_buffer_region_format(cmd_buffer, |
| ds_image, |
| buffer->dev_addr, |
| ®ion, |
| copy_format, |
| copy_format); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| new_sub_cmd->transfer.serialize_with_frag = true; |
| |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| /* Now we have to fiddle with cmd_buffer to place this transfer command |
| * *before* the target gfx subcommand. |
| * |
| * Note the doc for list_move_to() is subtly wrong - item is placed |
| * directly *after* loc in the list, not "in front of". |
| */ |
| list_move_to(&new_sub_cmd->link, prev_sub_cmd->link.prev); |
| |
| cmd_buffer->state.current_sub_cmd = prev_sub_cmd; |
| } |
| |
| if (ds->store.d || ds->store.s) { |
| cmd_buffer->state.current_sub_cmd = NULL; |
| |
| result = |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| result = pvr_copy_buffer_to_image_region_format(cmd_buffer, |
| buffer->dev_addr, |
| ds_image, |
| ®ion, |
| copy_format, |
| copy_format, |
| 0); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true; |
| |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| cmd_buffer->state.current_sub_cmd = prev_sub_cmd; |
| } |
| |
| /* Finally, patch up the target graphics sub_cmd to use the correctly-strided |
| * buffer. |
| */ |
| ds->has_alignment_transfers = true; |
| ds->addr = buffer->dev_addr; |
| ds->physical_extent = rounded_size; |
| |
| gfx_sub_cmd->wait_on_previous_transfer = true; |
| |
| return VK_SUCCESS; |
| } |
| |
| struct pvr_emit_state { |
| uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS] |
| [ROGUE_NUM_PBESTATE_STATE_WORDS]; |
| |
| uint64_t pbe_reg_words[PVR_MAX_COLOR_ATTACHMENTS] |
| [ROGUE_NUM_PBESTATE_REG_WORDS]; |
| |
| uint32_t emit_count; |
| }; |
| |
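/* Gather the PBE state words (consumed by the EOT USC program) and PBE
 * register words for every end-of-tile emit of this HW render. If the render
 * has no EOT surfaces, a single "empty tile" emit is produced instead.
 */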
| static void |
| pvr_setup_emit_state(const struct pvr_device_info *dev_info, |
| const struct pvr_renderpass_hwsetup_render *hw_render, |
| struct pvr_render_pass_info *render_pass_info, |
| struct pvr_emit_state *emit_state) |
| { |
| assert(hw_render->pbe_emits <= PVR_NUM_PBE_EMIT_REGS); |
| |
| if (hw_render->eot_surface_count == 0) { |
| emit_state->emit_count = 1; |
| pvr_csb_pack (&emit_state->pbe_cs_words[0][1], |
| PBESTATE_STATE_WORD1, |
| state) { |
| state.emptytile = true; |
| } |
| return; |
| } |
| |
| static_assert(USC_MRT_RESOURCE_TYPE_OUTPUT_REG + 1 == |
| USC_MRT_RESOURCE_TYPE_MEMORY, |
| "The loop below needs adjusting."); |
| |
| emit_state->emit_count = 0; |
| for (uint32_t resource_type = USC_MRT_RESOURCE_TYPE_OUTPUT_REG; |
| resource_type <= USC_MRT_RESOURCE_TYPE_MEMORY; |
| resource_type++) { |
| for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) { |
| const struct pvr_framebuffer *framebuffer = |
| render_pass_info->framebuffer; |
| const struct pvr_renderpass_hwsetup_eot_surface *surface = |
| &hw_render->eot_surfaces[i]; |
| const struct pvr_image_view *iview = |
| render_pass_info->attachments[surface->attachment_idx]; |
| const struct usc_mrt_resource *mrt_resource = |
| &hw_render->eot_setup.mrt_resources[surface->mrt_idx]; |
| uint32_t samples = 1; |
| |
| if (mrt_resource->type != resource_type) |
| continue; |
| |
| if (surface->need_resolve) { |
| const struct pvr_image_view *resolve_src = |
| render_pass_info->attachments[surface->src_attachment_idx]; |
| |
| /* Attachments that are the destination of resolve operations must |
| * be loaded before their next use. |
| */ |
| render_pass_info->enable_bg_tag = true; |
| render_pass_info->process_empty_tiles = true; |
| |
| if (surface->resolve_type != PVR_RESOLVE_TYPE_PBE) |
| continue; |
| |
| samples = (uint32_t)resolve_src->vk.image->samples; |
| } |
| |
| assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_cs_words)); |
| assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_reg_words)); |
| |
| pvr_setup_pbe_state(dev_info, |
| framebuffer, |
| emit_state->emit_count, |
| mrt_resource, |
| iview, |
| &render_pass_info->render_area, |
| surface->need_resolve, |
| samples, |
| emit_state->pbe_cs_words[emit_state->emit_count], |
| emit_state->pbe_reg_words[emit_state->emit_count]); |
| emit_state->emit_count += 1; |
| } |
| } |
| |
| assert(emit_state->emit_count == hw_render->pbe_emits); |
| } |
| |
| static inline bool |
| pvr_is_render_area_tile_aligned(const struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_image_view *iview) |
| { |
| const VkRect2D *render_area = |
| &cmd_buffer->state.render_pass_info.render_area; |
| |
| return render_area->offset.x == 0 && render_area->offset.y == 0 && |
| render_area->extent.height == iview->vk.extent.height && |
| render_area->extent.width == iview->vk.extent.width; |
| } |
| |
| static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info, |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_gfx *sub_cmd) |
| { |
| static const VkClearDepthStencilValue default_ds_clear_value = { |
| .depth = 1.0f, |
| .stencil = 0xFFFFFFFF, |
| }; |
| |
| const struct vk_dynamic_graphics_state *dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| struct pvr_render_pass_info *render_pass_info = |
| &cmd_buffer->state.render_pass_info; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx]; |
| struct pvr_render_job *job = &sub_cmd->job; |
| struct pvr_pds_upload pds_pixel_event_program; |
| struct pvr_framebuffer *framebuffer = render_pass_info->framebuffer; |
| struct pvr_spm_bgobj_state *spm_bgobj_state = |
| &framebuffer->spm_bgobj_state_per_render[sub_cmd->hw_render_idx]; |
| struct pvr_render_target *render_target; |
| VkResult result; |
| |
| if (sub_cmd->barrier_store) { |
      /* There can only ever be one frag job running on the hardware at any one
       * time, and a context switch is not allowed mid-tile, so instead of
       * allocating a new scratch buffer we can reuse the SPM scratch buffer to
       * perform the store.
       * Use the SPM EOT program with the SPM PBE reg words to store the render
       * to the SPM scratch buffer.
       */
| |
| memcpy(job->pbe_reg_words, |
| &framebuffer->spm_eot_state_per_render[0].pbe_reg_words, |
| sizeof(job->pbe_reg_words)); |
| job->pds_pixel_event_data_offset = |
| framebuffer->spm_eot_state_per_render[0] |
| .pixel_event_program_data_offset; |
| } else { |
| struct pvr_emit_state emit_state = { 0 }; |
| |
| pvr_setup_emit_state(dev_info, hw_render, render_pass_info, &emit_state); |
| |
| memcpy(job->pbe_reg_words, |
| emit_state.pbe_reg_words, |
| sizeof(job->pbe_reg_words)); |
| |
| result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload( |
| cmd_buffer, |
| emit_state.emit_count, |
| emit_state.pbe_cs_words[0], |
| &pds_pixel_event_program); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset; |
| } |
| |
| if (sub_cmd->barrier_load) { |
| job->enable_bg_tag = true; |
| job->process_empty_tiles = true; |
| |
| /* Load the previously stored render from the SPM scratch buffer. */ |
| |
| STATIC_ASSERT(ARRAY_SIZE(job->pds_bgnd_reg_values) == |
| ARRAY_SIZE(spm_bgobj_state->pds_reg_values)); |
| typed_memcpy(job->pds_bgnd_reg_values, |
| spm_bgobj_state->pds_reg_values, |
| ARRAY_SIZE(spm_bgobj_state->pds_reg_values)); |
| } else if (hw_render->load_op) { |
| const struct pvr_load_op *load_op = hw_render->load_op; |
| struct pvr_pds_upload load_op_program; |
| |
| /* Recalculate Background Object(s). */ |
| |
| /* FIXME: Should we free the PDS pixel event data or let it be freed |
| * when the pool gets emptied? |
| */ |
| result = pvr_load_op_data_create_and_upload(cmd_buffer, |
| load_op, |
| &load_op_program); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| job->enable_bg_tag = render_pass_info->enable_bg_tag; |
| job->process_empty_tiles = render_pass_info->process_empty_tiles; |
| |
| pvr_pds_bgnd_pack_state(load_op, |
| &load_op_program, |
| job->pds_bgnd_reg_values); |
| } |
| |
   /* TODO: In some cases a PR (partial render) can be removed by storing to
    * the color attachment and having the background object load directly from
    * it instead of using the scratch buffer. In those cases we can also set
    * this to "false" and avoid extra fw overhead.
    */
   /* The scratch buffer is always needed and allocated to avoid data loss in
    * case SPM is hit, so set the flag unconditionally.
    */
| job->requires_spm_scratch_buffer = true; |
| |
| memcpy(job->pr_pbe_reg_words, |
| &framebuffer->spm_eot_state_per_render[0].pbe_reg_words, |
| sizeof(job->pbe_reg_words)); |
| job->pr_pds_pixel_event_data_offset = |
| framebuffer->spm_eot_state_per_render[0].pixel_event_program_data_offset; |
| |
| STATIC_ASSERT(ARRAY_SIZE(job->pds_pr_bgnd_reg_values) == |
| ARRAY_SIZE(spm_bgobj_state->pds_reg_values)); |
| typed_memcpy(job->pds_pr_bgnd_reg_values, |
| spm_bgobj_state->pds_reg_values, |
| ARRAY_SIZE(spm_bgobj_state->pds_reg_values)); |
| |
| render_target = pvr_get_render_target(render_pass_info->pass, |
| framebuffer, |
| sub_cmd->hw_render_idx); |
| job->rt_dataset = render_target->rt_dataset; |
| |
| job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream); |
| |
| if (sub_cmd->depth_bias_bo) |
| job->depth_bias_table_addr = sub_cmd->depth_bias_bo->dev_addr; |
| else |
| job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID; |
| |
| if (sub_cmd->scissor_bo) |
| job->scissor_table_addr = sub_cmd->scissor_bo->dev_addr; |
| else |
| job->scissor_table_addr = PVR_DEV_ADDR_INVALID; |
| |
| job->pixel_output_width = |
| pvr_pass_get_pixel_output_width(render_pass_info->pass, |
| sub_cmd->hw_render_idx, |
| dev_info); |
| |
| /* Setup depth/stencil job information. */ |
| if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) { |
| struct pvr_image_view *ds_iview = |
| render_pass_info->attachments[hw_render->ds_attach_idx]; |
| const struct pvr_image *ds_image = pvr_image_view_get_image(ds_iview); |
| |
| job->has_depth_attachment = vk_format_has_depth(ds_image->vk.format); |
| job->has_stencil_attachment = vk_format_has_stencil(ds_image->vk.format); |
| |
| if (job->has_depth_attachment || job->has_stencil_attachment) { |
| uint32_t level_pitch = |
| ds_image->mip_levels[ds_iview->vk.base_mip_level].pitch; |
| const bool render_area_is_tile_aligned = |
| pvr_is_render_area_tile_aligned(cmd_buffer, ds_iview); |
| bool store_was_optimised_out = false; |
| bool d_store = false, s_store = false; |
| bool d_load = false, s_load = false; |
| |
| job->ds.iview = ds_iview; |
| job->ds.addr = ds_image->dev_addr; |
| |
| job->ds.stride = |
| pvr_stride_from_pitch(level_pitch, ds_iview->vk.format); |
| job->ds.height = ds_iview->vk.extent.height; |
| job->ds.physical_extent = (VkExtent2D){ |
| .width = u_minify(ds_image->physical_extent.width, |
| ds_iview->vk.base_mip_level), |
| .height = u_minify(ds_image->physical_extent.height, |
| ds_iview->vk.base_mip_level), |
| }; |
| job->ds.layer_size = ds_image->layer_size; |
| |
| job->ds_clear_value = default_ds_clear_value; |
| |
| if (hw_render->ds_attach_idx < render_pass_info->clear_value_count) { |
| const VkClearDepthStencilValue *const clear_values = |
| &render_pass_info->clear_values[hw_render->ds_attach_idx] |
| .depthStencil; |
| |
| if (job->has_depth_attachment) |
| job->ds_clear_value.depth = clear_values->depth; |
| |
| if (job->has_stencil_attachment) |
| job->ds_clear_value.stencil = clear_values->stencil; |
| } |
| |
| switch (ds_iview->vk.format) { |
| case VK_FORMAT_D16_UNORM: |
| job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_16BITINT; |
| break; |
| |
| case VK_FORMAT_S8_UINT: |
| case VK_FORMAT_D32_SFLOAT: |
| job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_F32Z; |
| break; |
| |
| case VK_FORMAT_D24_UNORM_S8_UINT: |
| job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_24BITINT; |
| break; |
| |
| default: |
| unreachable("Unsupported depth stencil format"); |
| } |
| |
| job->ds.memlayout = ds_image->memlayout; |
| |
| if (job->has_depth_attachment) { |
| if (hw_render->depth_store || sub_cmd->barrier_store) { |
| const bool depth_init_is_clear = hw_render->depth_init == |
| VK_ATTACHMENT_LOAD_OP_CLEAR; |
| |
| d_store = true; |
| |
| if (hw_render->depth_store && render_area_is_tile_aligned && |
| !(sub_cmd->modifies_depth || depth_init_is_clear)) { |
| d_store = false; |
| store_was_optimised_out = true; |
| } |
| } |
| |
| if (d_store && !render_area_is_tile_aligned) { |
| d_load = true; |
| } else if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_LOAD) { |
| enum pvr_depth_stencil_usage depth_usage = sub_cmd->depth_usage; |
| |
| assert(depth_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED); |
| d_load = (depth_usage != PVR_DEPTH_STENCIL_USAGE_NEVER); |
| } else { |
| d_load = sub_cmd->barrier_load; |
| } |
| } |
| |
| if (job->has_stencil_attachment) { |
| if (hw_render->stencil_store || sub_cmd->barrier_store) { |
| const bool stencil_init_is_clear = hw_render->stencil_init == |
| VK_ATTACHMENT_LOAD_OP_CLEAR; |
| |
| s_store = true; |
| |
| if (hw_render->stencil_store && render_area_is_tile_aligned && |
| !(sub_cmd->modifies_stencil || stencil_init_is_clear)) { |
| s_store = false; |
| store_was_optimised_out = true; |
| } |
| } |
| |
| if (s_store && !render_area_is_tile_aligned) { |
| s_load = true; |
| } else if (hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_LOAD) { |
| enum pvr_depth_stencil_usage stencil_usage = |
| sub_cmd->stencil_usage; |
| |
| assert(stencil_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED); |
| s_load = (stencil_usage != PVR_DEPTH_STENCIL_USAGE_NEVER); |
| } else { |
| s_load = sub_cmd->barrier_load; |
| } |
| } |
| |
| job->ds.load.d = d_load; |
| job->ds.load.s = s_load; |
| job->ds.store.d = d_store; |
| job->ds.store.s = s_store; |
| |
| /* ZLS can't do masked writes for packed depth stencil formats so if |
| * we store anything, we have to store everything. |
| */ |
| if ((job->ds.store.d || job->ds.store.s) && |
| pvr_zls_format_type_is_packed(job->ds.zls_format)) { |
| job->ds.store.d = true; |
| job->ds.store.s = true; |
| |
| /* In case we are only operating on one aspect of the attachment we |
| * need to load the unused one in order to preserve its contents due |
| * to the forced store which might otherwise corrupt it. |
| */ |
| if (hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR) |
| job->ds.load.d = true; |
| |
| if (hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR) |
| job->ds.load.s = true; |
| } |
| |
| if (pvr_ds_attachment_requires_zls(&job->ds) || |
| store_was_optimised_out) { |
| job->process_empty_tiles = true; |
| } |
| |
| if (pvr_sub_cmd_gfx_requires_ds_subtile_alignment(dev_info, job)) { |
| result = pvr_sub_cmd_gfx_align_ds_subtiles(cmd_buffer, sub_cmd); |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| } |
| } else { |
| job->has_depth_attachment = false; |
| job->has_stencil_attachment = false; |
| job->ds_clear_value = default_ds_clear_value; |
| } |
| |
| if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) { |
| struct pvr_image_view *iview = |
| render_pass_info->attachments[hw_render->ds_attach_idx]; |
| const struct pvr_image *image = pvr_image_view_get_image(iview); |
| |
| /* If the HW render pass has a valid depth/stencil surface, determine the |
| * sample count from the attachment's image. |
| */ |
| job->samples = image->vk.samples; |
| } else if (hw_render->output_regs_count) { |
| /* If the HW render pass has output registers, we have color attachments |
| * to write to, so determine the sample count from the count specified for |
| * every color attachment in this render. |
| */ |
| job->samples = hw_render->sample_count; |
| } else if (cmd_buffer->state.gfx_pipeline) { |
| /* If the HW render pass has no color or depth/stencil attachments, we |
| * determine the sample count from the count given during pipeline |
| * creation. |
| */ |
| job->samples = dynamic_state->ms.rasterization_samples; |
| } else if (render_pass_info->pass->attachment_count > 0) { |
| /* If we get here, we have a render pass with subpasses containing no |
    * attachments. The next best thing is the largest of the sample counts
| * specified by the render pass attachment descriptions. |
| */ |
| job->samples = render_pass_info->pass->max_sample_count; |
| } else { |
| /* No appropriate framebuffer attachment is available. */ |
| mesa_logw("Defaulting render job sample count to 1."); |
| job->samples = VK_SAMPLE_COUNT_1_BIT; |
| } |
| |
| if (sub_cmd->max_tiles_in_flight == |
| PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) { |
| /* Use the default limit based on the partition store. */ |
| job->max_tiles_in_flight = 0U; |
| } else { |
| job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight; |
| } |
| |
| job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops; |
| job->disable_compute_overlap = false; |
| job->max_shared_registers = cmd_buffer->state.max_shared_regs; |
| job->run_frag = true; |
| job->geometry_terminate = true; |
| |
| /* TODO: Enable pixel merging when it's safe to do. */ |
| job->disable_pixel_merging = true; |
| |
| return VK_SUCCESS; |
| } |
| |
| static void |
| pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice, |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *sub_cmd) |
| { |
| sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds, |
| cmd_buffer->state.max_shared_regs); |
| |
| cmd_buffer->state.max_shared_regs = 0U; |
| } |
| |
| #define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \ |
| (1024 / ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) |
| |
| static uint32_t |
| pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice, |
| uint32_t coeff_regs_count, |
| bool use_barrier, |
| uint32_t total_workitems) |
| { |
| const struct pvr_device_runtime_info *dev_runtime_info = |
| &pdevice->dev_runtime_info; |
| const struct pvr_device_info *dev_info = &pdevice->dev_info; |
| uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK; |
| uint32_t max_avail_coeff_regs = |
| dev_runtime_info->cdm_max_local_mem_size_regs; |
| uint32_t localstore_chunks_count = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs_count), |
| ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE); |
| |
| /* Ensure that we cannot have more workgroups in a slot than the available |
| * number of coefficient registers allows us to have. |
| */ |
| if (coeff_regs_count > 0U) { |
| /* If the geometry or fragment jobs can overlap with the compute job, or |
| * if there is a vertex shader already running then we need to consider |
| * this in calculating max allowed work-groups. |
| */ |
| if (PVR_HAS_QUIRK(dev_info, 52354) && |
| (PVR_HAS_FEATURE(dev_info, compute_overlap) || |
| PVR_HAS_FEATURE(dev_info, gs_rta_support))) { |
| /* Solve for n (number of work-groups per task). All values are in |
| * size of common store alloc blocks: |
| * |
| * n + (2n + 7) * (local_memory_size_max - 1) = |
| * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max) |
| * ==> |
| * n + 2n * (local_memory_size_max - 1) = |
| * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max) |
| * - (7 * (local_memory_size_max - 1)) |
| * ==> |
| * n * (1 + 2 * (local_memory_size_max - 1)) = |
| * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max) |
| * - (7 * (local_memory_size_max - 1)) |
| * ==> |
| * n = ((coefficient_memory_pool_size) - |
| * (7 * pixel_allocation_size_max) - |
| * (7 * (local_memory_size_max - 1))) / |
| * (1 + 2 * (local_memory_size_max - 1)) |
| */ |
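| /* Purely illustrative numbers: with max_common_store_blocks = 1024, |
| * PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS = 16 and localstore_chunks_count = |
| * 3, we get 1024 - (7 * 16) - (7 * 2) = 898 blocks, so |
| * n = 898 / (1 + 2 * 2) = 179, which is then clamped to |
| * ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK below. |
| */ |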
| uint32_t max_common_store_blocks = |
| DIV_ROUND_UP(max_avail_coeff_regs * 4U, |
| ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE); |
| |
| /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max) |
| */ |
| max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES * |
| PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS; |
| |
| /* - (7 * (local_memory_size_max - 1)) */ |
| max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES * |
| (localstore_chunks_count - 1U)); |
| |
| /* Divide by (1 + 2 * (local_memory_size_max - 1)) */ |
| max_workgroups_per_task = max_common_store_blocks / |
| (1U + 2U * (localstore_chunks_count - 1U)); |
| |
| max_workgroups_per_task = |
| MIN2(max_workgroups_per_task, |
| ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK); |
| |
| } else { |
| max_workgroups_per_task = |
| MIN2((max_avail_coeff_regs / coeff_regs_count), |
| max_workgroups_per_task); |
| } |
| } |
| |
| /* max_workgroups_per_task should at least be one. */ |
| assert(max_workgroups_per_task >= 1U); |
| |
| if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) { |
| /* In this case, the work group size will have been padded up to the |
| * next multiple of ROGUE_MAX_INSTANCES_PER_TASK, so we just set max |
| * instances to ROGUE_MAX_INSTANCES_PER_TASK. |
| */ |
| return ROGUE_MAX_INSTANCES_PER_TASK; |
| } |
| |
| /* In this case, the number of instances in the slot must be clamped to |
| * accommodate whole work-groups only. |
| */ |
| if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) { |
| max_workgroups_per_task = |
| MIN2(max_workgroups_per_task, |
| ROGUE_MAX_INSTANCES_PER_TASK / total_workitems); |
| return total_workitems * max_workgroups_per_task; |
| } |
| |
| return MIN2(total_workitems * max_workgroups_per_task, |
| ROGUE_MAX_INSTANCES_PER_TASK); |
| } |
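| /* Illustrative use of the above: a dispatch with a 4x2x1 workgroup |
| * (8 work-items), no coefficient usage and no barrier returns |
| * MIN2(8 * max_workgroups_per_task, ROGUE_MAX_INSTANCES_PER_TASK), while |
| * a barrier (or BRN 49032) first clamps the workgroup count so that only |
| * whole workgroups share a slot. |
| */ |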
| |
| static void |
| pvr_compute_generate_control_stream(struct pvr_csb *csb, |
| struct pvr_sub_cmd_compute *sub_cmd, |
| const struct pvr_compute_kernel_info *info) |
| { |
| pvr_csb_set_relocation_mark(csb); |
| |
| /* Compute kernel 0. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) { |
| kernel0.indirect_present = !!info->indirect_buffer_addr.addr; |
| kernel0.global_offsets_present = info->global_offsets_present; |
| kernel0.usc_common_size = info->usc_common_size; |
| kernel0.usc_unified_size = info->usc_unified_size; |
| kernel0.pds_temp_size = info->pds_temp_size; |
| kernel0.pds_data_size = info->pds_data_size; |
| kernel0.usc_target = info->usc_target; |
| kernel0.fence = info->is_fence; |
| } |
| |
| /* Compute kernel 1. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) { |
| kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset); |
| kernel1.sd_type = info->sd_type; |
| kernel1.usc_common_shared = info->usc_common_shared; |
| } |
| |
| /* Compute kernel 2. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) { |
| kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset); |
| } |
| |
| if (info->indirect_buffer_addr.addr) { |
| /* Compute kernel 6. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) { |
| kernel6.indirect_addrmsb = info->indirect_buffer_addr; |
| } |
| |
| /* Compute kernel 7. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) { |
| kernel7.indirect_addrlsb = info->indirect_buffer_addr; |
| } |
| } else { |
| /* Compute kernel 3. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) { |
| assert(info->global_size[0U] > 0U); |
| kernel3.workgroup_x = info->global_size[0U] - 1U; |
| } |
| |
| /* Compute kernel 4. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) { |
| assert(info->global_size[1U] > 0U); |
| kernel4.workgroup_y = info->global_size[1U] - 1U; |
| } |
| |
| /* Compute kernel 5. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) { |
| assert(info->global_size[2U] > 0U); |
| kernel5.workgroup_z = info->global_size[2U] - 1U; |
| } |
| } |
| |
| /* Compute kernel 8. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) { |
| if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK) |
| kernel8.max_instances = 0U; |
| else |
| kernel8.max_instances = info->max_instances; |
| |
| assert(info->local_size[0U] > 0U); |
| kernel8.workgroup_size_x = info->local_size[0U] - 1U; |
| assert(info->local_size[1U] > 0U); |
| kernel8.workgroup_size_y = info->local_size[1U] - 1U; |
| assert(info->local_size[2U] > 0U); |
| kernel8.workgroup_size_z = info->local_size[2U] - 1U; |
| } |
| |
| pvr_csb_clear_relocation_mark(csb); |
| |
| /* Track the highest shared register usage in this dispatch. |
| * This is used by the FW for context switching, so must be large enough |
| * to contain all the shared registers that might be in use for this compute |
| * job. Coefficients don't need to be included as the context switch will not |
| * happen within the execution of a single workgroup, thus nothing needs to |
| * be preserved. |
| */ |
| if (info->usc_common_shared) { |
| sub_cmd->num_shared_regs = |
| MAX2(sub_cmd->num_shared_regs, info->usc_common_size); |
| } |
| } |
| |
| /* TODO: This can be pre-packed and uploaded directly. Would that provide any |
| * speed up? |
| */ |
| static void |
| pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| bool *const is_sw_barrier_required = |
| &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing; |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| const struct pvr_pds_upload *program; |
| |
| if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) && |
| *is_sw_barrier_required) { |
| *is_sw_barrier_required = false; |
| program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds; |
| } else { |
| program = &cmd_buffer->device->idfwdf_state.pds; |
| } |
| |
| struct pvr_compute_kernel_info info = { |
| .indirect_buffer_addr = PVR_DEV_ADDR_INVALID, |
| .global_offsets_present = false, |
| .usc_common_size = DIV_ROUND_UP( |
| PVR_DW_TO_BYTES(cmd_buffer->device->idfwdf_state.usc_shareds), |
| ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE), |
| .usc_unified_size = 0U, |
| .pds_temp_size = 0U, |
| .pds_data_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size), |
| ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE), |
| .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL, |
| .is_fence = false, |
| .pds_data_offset = program->data_offset, |
| .sd_type = ROGUE_CDMCTRL_SD_TYPE_USC, |
| .usc_common_shared = true, |
| .pds_code_offset = program->code_offset, |
| .global_size = { 1U, 1U, 1U }, |
| .local_size = { 1U, 1U, 1U }, |
| }; |
| |
| /* We don't need to pad work-group size for this case. */ |
| |
| info.max_instances = |
| pvr_compute_flat_slot_size(pdevice, |
| cmd_buffer->device->idfwdf_state.usc_shareds, |
| false, |
| 1U); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
| |
| void pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd, |
| bool deallocate_shareds) |
| { |
| const struct pvr_pds_upload *program = |
| &cmd_buffer->device->pds_compute_fence_program; |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| |
| struct pvr_compute_kernel_info info = { |
| .indirect_buffer_addr = PVR_DEV_ADDR_INVALID, |
| .global_offsets_present = false, |
| .usc_common_size = 0U, |
| .usc_unified_size = 0U, |
| .pds_temp_size = 0U, |
| .pds_data_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size), |
| ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE), |
| .usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY, |
| .is_fence = true, |
| .pds_data_offset = program->data_offset, |
| .sd_type = ROGUE_CDMCTRL_SD_TYPE_PDS, |
| .usc_common_shared = deallocate_shareds, |
| .pds_code_offset = program->code_offset, |
| .global_size = { 1U, 1U, 1U }, |
| .local_size = { 1U, 1U, 1U }, |
| }; |
| |
| /* We don't need to pad work-group size for this case. */ |
| /* Here we calculate the slot size. This can depend on the use of barriers, |
| * local memory, BRNs or other factors. |
| */ |
| info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
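| /* Usage note: pvr_cmd_buffer_end_sub_cmd() emits such a fence with |
| * deallocate_shareds = true before terminating a compute sub command's |
| * control stream (see the PVR_SUB_CMD_TYPE_COMPUTE case below). |
| */ |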
| |
| static VkResult |
| pvr_cmd_buffer_process_deferred_clears(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| util_dynarray_foreach (&cmd_buffer->deferred_clears, |
| struct pvr_transfer_cmd, |
| transfer_cmd) { |
| VkResult result; |
| |
| result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true; |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd; |
| struct pvr_device *device = cmd_buffer->device; |
| const struct pvr_query_pool *query_pool = NULL; |
| struct pvr_suballoc_bo *query_bo = NULL; |
| size_t query_indices_size = 0; |
| VkResult result; |
| |
| /* FIXME: Is this NULL check required because this function is called from |
| * pvr_resolve_unemitted_resolve_attachments()? See comment about this |
| * function being called twice in a row in pvr_CmdEndRenderPass(). |
| */ |
| if (!sub_cmd) |
| return VK_SUCCESS; |
| |
| if (!sub_cmd->owned) { |
| state->current_sub_cmd = NULL; |
| return VK_SUCCESS; |
| } |
| |
| switch (sub_cmd->type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: { |
| struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx; |
| |
| query_indices_size = |
| util_dynarray_num_elements(&state->query_indices, char); |
| |
| if (query_indices_size > 0) { |
| const bool secondary_cont = |
| cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && |
| cmd_buffer->usage_flags & |
| VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT; |
| |
| assert(gfx_sub_cmd->query_pool); |
| |
| if (secondary_cont) { |
| util_dynarray_append_dynarray(&state->query_indices, |
| &gfx_sub_cmd->sec_query_indices); |
| } else { |
| const void *data = util_dynarray_begin(&state->query_indices); |
| |
| result = pvr_cmd_buffer_upload_general(cmd_buffer, |
| data, |
| query_indices_size, |
| &query_bo); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| query_pool = gfx_sub_cmd->query_pool; |
| } |
| |
| gfx_sub_cmd->has_occlusion_query = true; |
| |
| util_dynarray_clear(&state->query_indices); |
| } |
| |
| if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { |
| result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| break; |
| } |
| |
| /* TODO: Check if the sub_cmd can be skipped based on |
| * sub_cmd->gfx.empty_cmd flag. |
| */ |
| |
| /* TODO: Set the state in the functions called with the command buffer |
| * instead of here. |
| */ |
| |
| result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, |
| &gfx_sub_cmd->control_stream); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info, |
| cmd_buffer, |
| gfx_sub_cmd); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| if (pvr_sub_cmd_gfx_requires_split_submit(gfx_sub_cmd)) { |
| result = pvr_sub_cmd_gfx_build_terminate_ctrl_stream(device, |
| cmd_buffer, |
| gfx_sub_cmd); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| } |
| |
| break; |
| } |
| |
| case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY: |
| case PVR_SUB_CMD_TYPE_COMPUTE: { |
| struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute; |
| |
| pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true); |
| |
| result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| pvr_sub_cmd_compute_job_init(device->pdevice, |
| cmd_buffer, |
| compute_sub_cmd); |
| break; |
| } |
| |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| break; |
| |
| default: |
| unreachable("Unsupported sub-command type"); |
| } |
| |
| state->current_sub_cmd = NULL; |
| |
| /* pvr_cmd_buffer_process_deferred_clears() must be called with a NULL |
| * current_sub_cmd. |
| * |
| * We can only start a sub_cmd of a different type from the current sub_cmd |
| * after having ended the current one. However, we can't end the current |
| * sub_cmd if doing so depends on starting sub_cmd(s) of a different type. |
| * Hence, don't try to start transfer sub_cmd(s) with |
| * pvr_cmd_buffer_process_deferred_clears() until the current sub_cmd has |
| * ended. Failing to do so would cause a circular dependency between |
| * pvr_cmd_buffer_{end,start}_sub_cmd() and blow the stack. |
| */ |
| if (sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) { |
| result = pvr_cmd_buffer_process_deferred_clears(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| } |
| |
| if (query_pool) { |
| struct pvr_query_info query_info; |
| |
| assert(query_bo); |
| assert(query_indices_size); |
| |
| query_info.type = PVR_QUERY_TYPE_AVAILABILITY_WRITE; |
| |
| /* sizeof(uint32_t) is the size of a single query index. */ |
| query_info.availability_write.num_query_indices = |
| query_indices_size / sizeof(uint32_t); |
| query_info.availability_write.index_bo = query_bo; |
| |
| query_info.availability_write.num_queries = query_pool->query_count; |
| query_info.availability_write.availability_bo = |
| query_pool->availability_buffer; |
| |
| /* Insert a barrier after the graphics sub command and before the |
| * query sub command so that the availability write program waits for the |
| * fragment shader to complete. |
| */ |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){ |
| .type = PVR_EVENT_TYPE_BARRIER, |
| .barrier = { |
| .wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT, |
| .wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT, |
| }, |
| }; |
| |
| return pvr_add_query_program(cmd_buffer, &query_info); |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer *const cmd_buffer, |
| bool start_geom) |
| { |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| |
| if (start_geom) { |
| /* |
| * Initial geometry phase state. |
| * It's the driver's responsibility to ensure that the state of the |
| * hardware is correctly initialized at the start of every geometry |
| * phase. This is required to prevent stale state from a previous |
| * geometry phase erroneously affecting the next geometry phase. |
| * |
| * If a geometry phase does not contain any geometry, this restriction |
| * can be ignored. If the first draw call in a geometry phase will only |
| * update the depth or stencil buffers, i.e. ISP_TAGWRITEDISABLE is set |
| * in the ISP State Control Word, the PDS State Pointers |
| * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to |
| * be supplied, since they will never reach the PDS in the fragment |
| * phase. |
| */ |
| |
| cmd_buffer->state.emit_header = (struct ROGUE_TA_STATE_HEADER){ |
| .pres_stream_out_size = true, |
| .pres_ppp_ctrl = true, |
| .pres_varying_word2 = true, |
| .pres_varying_word1 = true, |
| .pres_varying_word0 = true, |
| .pres_outselects = true, |
| .pres_wclamp = true, |
| .pres_viewport = true, |
| .pres_region_clip = true, |
| .pres_pds_state_ptr0 = true, |
| .pres_ispctl_fb = true, |
| .pres_ispctl = true, |
| }; |
| } else { |
| struct ROGUE_TA_STATE_HEADER *const emit_header = |
| &cmd_buffer->state.emit_header; |
| |
| emit_header->pres_ppp_ctrl = true; |
| emit_header->pres_varying_word1 = true; |
| emit_header->pres_varying_word0 = true; |
| emit_header->pres_outselects = true; |
| emit_header->pres_viewport = true; |
| emit_header->pres_region_clip = true; |
| emit_header->pres_pds_state_ptr0 = true; |
| emit_header->pres_ispctl_fb = true; |
| emit_header->pres_ispctl = true; |
| } |
| |
| memset(&cmd_buffer->state.ppp_state, |
| 0U, |
| sizeof(cmd_buffer->state.ppp_state)); |
| |
| cmd_buffer->state.dirty.vertex_bindings = true; |
| cmd_buffer->state.dirty.gfx_pipeline_binding = true; |
| |
| BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS); |
| BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT); |
| } |
| |
| static inline bool |
| pvr_cmd_uses_deferred_cs_cmds(const struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| const VkCommandBufferUsageFlags deferred_control_stream_flags = |
| VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT | |
| VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; |
| |
| return cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && |
| (cmd_buffer->usage_flags & deferred_control_stream_flags) == |
| deferred_control_stream_flags; |
| } |
| |
| VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer, |
| enum pvr_sub_cmd_type type) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_device *device = cmd_buffer->device; |
| struct pvr_sub_cmd *sub_cmd; |
| VkResult result; |
| |
| /* Check the current status of the buffer. */ |
| if (vk_command_buffer_has_error(&cmd_buffer->vk)) |
| return vk_command_buffer_get_record_result(&cmd_buffer->vk); |
| |
| pvr_cmd_buffer_update_barriers(cmd_buffer, type); |
| |
| /* TODO: Add proper support for joining consecutive event sub_cmd? */ |
| if (state->current_sub_cmd) { |
| if (state->current_sub_cmd->type == type) { |
| /* Continue adding to the current sub command. */ |
| return VK_SUCCESS; |
| } |
| |
| /* End the current sub command. */ |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| |
| sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc, |
| sizeof(*sub_cmd), |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!sub_cmd) { |
| return vk_command_buffer_set_error(&cmd_buffer->vk, |
| VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| sub_cmd->type = type; |
| sub_cmd->owned = true; |
| |
| switch (type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: |
| sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED; |
| sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED; |
| sub_cmd->gfx.modifies_depth = false; |
| sub_cmd->gfx.modifies_stencil = false; |
| sub_cmd->gfx.max_tiles_in_flight = |
| PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info, |
| isp_max_tiles_in_flight, |
| 1); |
| sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass; |
| sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer; |
| sub_cmd->gfx.empty_cmd = true; |
| |
| if (state->vis_test_enabled) |
| sub_cmd->gfx.query_pool = state->query_pool; |
| |
| pvr_reset_graphics_dirty_state(cmd_buffer, true); |
| |
| if (pvr_cmd_uses_deferred_cs_cmds(cmd_buffer)) { |
| pvr_csb_init(device, |
| PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED, |
| &sub_cmd->gfx.control_stream); |
| } else { |
| pvr_csb_init(device, |
| PVR_CMD_STREAM_TYPE_GRAPHICS, |
| &sub_cmd->gfx.control_stream); |
| } |
| |
| util_dynarray_init(&sub_cmd->gfx.sec_query_indices, NULL); |
| break; |
| |
| case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY: |
| case PVR_SUB_CMD_TYPE_COMPUTE: |
| pvr_csb_init(device, |
| PVR_CMD_STREAM_TYPE_COMPUTE, |
| &sub_cmd->compute.control_stream); |
| break; |
| |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
| sub_cmd->transfer.transfer_cmds = &sub_cmd->transfer.transfer_cmds_priv; |
| list_inithead(sub_cmd->transfer.transfer_cmds); |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| break; |
| |
| default: |
| unreachable("Unsupported sub-command type"); |
| } |
| |
| list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds); |
| state->current_sub_cmd = sub_cmd; |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_winsys_heap *heap, |
| uint64_t size, |
| struct pvr_suballoc_bo **const pvr_bo_out) |
| { |
| const uint32_t cache_line_size = |
| rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info); |
| struct pvr_suballoc_bo *suballoc_bo; |
| struct pvr_suballocator *allocator; |
| VkResult result; |
| |
| if (heap == cmd_buffer->device->heaps.general_heap) |
| allocator = &cmd_buffer->device->suballoc_general; |
| else if (heap == cmd_buffer->device->heaps.pds_heap) |
| allocator = &cmd_buffer->device->suballoc_pds; |
| else if (heap == cmd_buffer->device->heaps.transfer_frag_heap) |
| allocator = &cmd_buffer->device->suballoc_transfer; |
| else if (heap == cmd_buffer->device->heaps.usc_heap) |
| allocator = &cmd_buffer->device->suballoc_usc; |
| else |
| unreachable("Unknown heap type"); |
| |
| result = |
| pvr_bo_suballoc(allocator, size, cache_line_size, false, &suballoc_bo); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| list_add(&suballoc_bo->link, &cmd_buffer->bo_list); |
| |
| *pvr_bo_out = suballoc_bo; |
| |
| return VK_SUCCESS; |
| } |
| |
| static void pvr_cmd_bind_compute_pipeline( |
| const struct pvr_compute_pipeline *const compute_pipeline, |
| struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| cmd_buffer->state.compute_pipeline = compute_pipeline; |
| cmd_buffer->state.dirty.compute_pipeline_binding = true; |
| } |
| |
| static void pvr_cmd_bind_graphics_pipeline( |
| const struct pvr_graphics_pipeline *const gfx_pipeline, |
| struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| cmd_buffer->state.gfx_pipeline = gfx_pipeline; |
| cmd_buffer->state.dirty.gfx_pipeline_binding = true; |
| |
| vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk, |
| &gfx_pipeline->dynamic_state); |
| } |
| |
| void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer, |
| VkPipelineBindPoint pipelineBindPoint, |
| VkPipeline _pipeline) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline); |
| |
| switch (pipelineBindPoint) { |
| case VK_PIPELINE_BIND_POINT_COMPUTE: |
| pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline), |
| cmd_buffer); |
| break; |
| |
| case VK_PIPELINE_BIND_POINT_GRAPHICS: |
| pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline), |
| cmd_buffer); |
| break; |
| |
| default: |
| unreachable("Invalid bind point."); |
| break; |
| } |
| } |
| |
| #if MESA_DEBUG |
| static void check_viewport_quirk_70165(const struct pvr_device *device, |
| const VkViewport *pViewport) |
| { |
| const struct pvr_device_info *dev_info = &device->pdevice->dev_info; |
| float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y; |
| float min_screen_space_value, max_screen_space_value; |
| float sign_to_unsigned_offset, fixed_point_max; |
| float guardband_width, guardband_height; |
| |
| if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { |
| /* Max representable value in 13.4 fixed point format. |
| * Round-down to avoid precision issues. |
| * Calculated as (2 ** 13) - 2*(2 ** -4) |
| */ |
| fixed_point_max = 8192.0f - 2.0f / 16.0f; |
| |
| if (PVR_HAS_FEATURE(dev_info, screen_size8K)) { |
| if (pViewport->width <= 4096 && pViewport->height <= 4096) { |
| guardband_width = pViewport->width / 4.0f; |
| guardband_height = pViewport->height / 4.0f; |
| |
| /* 2k of the range is negative */ |
| sign_to_unsigned_offset = 2048.0f; |
| } else { |
| guardband_width = 0.0f; |
| guardband_height = 0.0f; |
| |
| /* For > 4k renders, the entire range is positive */ |
| sign_to_unsigned_offset = 0.0f; |
| } |
| } else { |
| guardband_width = pViewport->width / 4.0f; |
| guardband_height = pViewport->height / 4.0f; |
| |
| /* 2k of the range is negative */ |
| sign_to_unsigned_offset = 2048.0f; |
| } |
| } else { |
| /* Max representable value in 16.8 fixed point format. |
| * Calculated as (2 ** 16) - (2 ** -8) |
| */ |
| fixed_point_max = 65535.99609375f; |
| guardband_width = pViewport->width / 4.0f; |
| guardband_height = pViewport->height / 4.0f; |
| |
| /* 4k/20k of the range is negative */ |
| sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET; |
| } |
| |
| min_screen_space_value = -sign_to_unsigned_offset; |
| max_screen_space_value = fixed_point_max - sign_to_unsigned_offset; |
| |
| min_vertex_x = pViewport->x - guardband_width; |
| max_vertex_x = pViewport->x + pViewport->width + guardband_width; |
| min_vertex_y = pViewport->y - guardband_height; |
| max_vertex_y = pViewport->y + pViewport->height + guardband_height; |
| if (min_vertex_x < min_screen_space_value || |
| max_vertex_x > max_screen_space_value || |
| min_vertex_y < min_screen_space_value || |
| max_vertex_y > max_screen_space_value) { |
| mesa_logw("Viewport is affected by BRN70165, geometry outside " |
| "the viewport could be corrupted"); |
| } |
| } |
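| /* Worked example (simple_internal_parameter_format core, 1920x1080 |
| * viewport at the origin): the guardband is 480x270, the representable |
| * screen-space range is [-2048, 6143.875], and the guardbanded extents |
| * [-480, 2400] x [-270, 1350] fit within it, so no warning is issued. |
| */ |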
| #endif |
| |
| void pvr_CmdSetViewport(VkCommandBuffer commandBuffer, |
| uint32_t firstViewport, |
| uint32_t viewportCount, |
| const VkViewport *pViewports) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| const uint32_t total_count = firstViewport + viewportCount; |
| |
| assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0); |
| assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| #if MESA_DEBUG |
| if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) { |
| for (uint32_t viewport = 0; viewport < viewportCount; viewport++) { |
| check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]); |
| } |
| } |
| #endif |
| |
| vk_common_CmdSetViewport(commandBuffer, |
| firstViewport, |
| viewportCount, |
| pViewports); |
| } |
| |
| void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer, |
| float minDepthBounds, |
| float maxDepthBounds) |
| { |
| mesa_logd("No support for depth bounds testing."); |
| } |
| |
| void pvr_CmdBindDescriptorSets2KHR( |
| VkCommandBuffer commandBuffer, |
| const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { |
| struct pvr_descriptor_state *desc_state = |
| &cmd_buffer->state.gfx_desc_state; |
| |
| for (unsigned u = 0; u < pBindDescriptorSetsInfo->descriptorSetCount; |
| ++u) { |
| VK_FROM_HANDLE(pvr_descriptor_set, |
| set, |
| pBindDescriptorSetsInfo->pDescriptorSets[u]); |
| unsigned desc_set = u + pBindDescriptorSetsInfo->firstSet; |
| |
| if (desc_state->sets[desc_set] != set) { |
| desc_state->sets[desc_set] = set; |
| desc_state->dirty_sets |= BITFIELD_BIT(desc_set); |
| } |
| } |
| |
| cmd_buffer->state.dirty.gfx_desc_dirty = true; |
| } |
| assert(!(pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)); |
| } |
| |
| void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, |
| uint32_t firstBinding, |
| uint32_t bindingCount, |
| const VkBuffer *pBuffers, |
| const VkDeviceSize *pOffsets) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings; |
| |
| /* We have to defer setting up the vertex buffers since we need the buffer |
| * stride from the pipeline. |
| */ |
| |
| assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS && |
| bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| for (uint32_t i = 0; i < bindingCount; i++) { |
| vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]); |
| vb[firstBinding + i].offset = pOffsets[i]; |
| } |
| |
| cmd_buffer->state.dirty.vertex_bindings = true; |
| } |
| |
| void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, |
| VkBuffer buffer, |
| VkDeviceSize offset, |
| VkIndexType indexType) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| assert(offset < index_buffer->vk.size); |
| assert(indexType == VK_INDEX_TYPE_UINT32 || |
| indexType == VK_INDEX_TYPE_UINT16 || |
| indexType == VK_INDEX_TYPE_UINT8_KHR); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| state->index_buffer_binding.buffer = index_buffer; |
| state->index_buffer_binding.offset = offset; |
| state->index_buffer_binding.type = indexType; |
| state->dirty.index_buffer_binding = true; |
| } |
| |
| void pvr_CmdPushConstants(VkCommandBuffer commandBuffer, |
| VkPipelineLayout layout, |
| VkShaderStageFlags stageFlags, |
| uint32_t offset, |
| uint32_t size, |
| const void *pValues) |
| { |
| #if MESA_DEBUG |
| const uint64_t ending = (uint64_t)offset + (uint64_t)size; |
| #endif |
| |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE); |
| |
| memcpy(&state->push_constants.data[offset], pValues, size); |
| |
| state->push_constants.dirty_stages |= stageFlags; |
| state->push_constants.uploaded = false; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_render_pass *pass, |
| const struct pvr_framebuffer *framebuffer) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_render_pass_info *info = &state->render_pass_info; |
| |
| assert(pass->attachment_count == framebuffer->attachment_count); |
| |
| /* Free any previously allocated attachments. */ |
| vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments); |
| |
| if (pass->attachment_count == 0) { |
| info->attachments = NULL; |
| return VK_SUCCESS; |
| } |
| |
| info->attachments = |
| vk_zalloc(&cmd_buffer->vk.pool->alloc, |
| pass->attachment_count * sizeof(*info->attachments), |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!info->attachments) { |
| return vk_command_buffer_set_error(&cmd_buffer->vk, |
| VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| for (uint32_t i = 0; i < pass->attachment_count; i++) |
| info->attachments[i] = framebuffer->attachments[i]; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult pvr_init_render_targets(struct pvr_device *device, |
| struct pvr_render_pass *pass, |
| struct pvr_framebuffer *framebuffer) |
| { |
| for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) { |
| struct pvr_render_target *render_target = |
| pvr_get_render_target(pass, framebuffer, i); |
| |
| pthread_mutex_lock(&render_target->mutex); |
| |
| if (!render_target->valid) { |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &pass->hw_setup->renders[i]; |
| VkResult result; |
| |
| result = pvr_render_target_dataset_create(device, |
| framebuffer->width, |
| framebuffer->height, |
| hw_render->sample_count, |
| framebuffer->layers, |
| &render_target->rt_dataset); |
| if (result != VK_SUCCESS) { |
| pthread_mutex_unlock(&render_target->mutex); |
| return result; |
| } |
| |
| render_target->valid = true; |
| } |
| |
| pthread_mutex_unlock(&render_target->mutex); |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| const struct pvr_renderpass_hwsetup_subpass * |
| pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass) |
| { |
| const struct pvr_renderpass_hw_map *map = |
| &pass->hw_setup->subpass_map[subpass]; |
| |
| return &pass->hw_setup->renders[map->render].subpasses[map->subpass]; |
| } |
| |
| static void pvr_perform_start_of_render_attachment_clear( |
| struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_framebuffer *framebuffer, |
| uint32_t index, |
| bool is_depth_stencil, |
| uint32_t *index_list_clear_mask) |
| { |
| ASSERTED static const VkImageAspectFlags dsc_aspect_flags = |
| VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT | |
| VK_IMAGE_ASPECT_COLOR_BIT; |
| struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info; |
| const struct pvr_render_pass *pass = info->pass; |
| const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render]; |
| VkImageAspectFlags image_aspect; |
| struct pvr_image_view *iview; |
| uint32_t view_idx; |
| |
| if (is_depth_stencil) { |
| bool stencil_clear; |
| bool depth_clear; |
| bool is_stencil; |
| bool is_depth; |
| |
| assert(hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED); |
| assert(index == 0); |
| |
| view_idx = hw_render->ds_attach_idx; |
| |
| is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format); |
| is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format); |
| depth_clear = hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR; |
| stencil_clear = hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR; |
| |
| /* Attempt to clear the ds attachment. Do not erroneously skip the clear |
| * when only one aspect (e.g. stencil but not depth) requires clearing: |
| * return early only if neither (is_depth && depth_clear) nor |
| * (is_stencil && stencil_clear) holds. |
| */ |
| if (!((is_depth && depth_clear) || (is_stencil && stencil_clear))) |
| return; |
| } else if (hw_render->color_init[index].op != VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| return; |
| } else { |
| view_idx = hw_render->color_init[index].index; |
| } |
| |
| iview = info->attachments[view_idx]; |
| |
| /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init() |
| * were doing the same check (even if it's just an assert) to determine if a |
| * clear is needed. |
| */ |
| /* If this is single-layer fullscreen, we already do the clears in |
| * pvr_sub_cmd_gfx_job_init(). |
| */ |
| if (pvr_is_render_area_tile_aligned(cmd_buffer, iview) && |
| framebuffer->layers == 1) { |
| return; |
| } |
| |
| image_aspect = vk_format_aspects(pass->attachments[view_idx].vk_format); |
| assert((image_aspect & ~dsc_aspect_flags) == 0); |
| |
| if (image_aspect & VK_IMAGE_ASPECT_DEPTH_BIT && |
| hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| image_aspect &= ~VK_IMAGE_ASPECT_DEPTH_BIT; |
| } |
| |
| if (image_aspect & VK_IMAGE_ASPECT_STENCIL_BIT && |
| hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| image_aspect &= ~VK_IMAGE_ASPECT_STENCIL_BIT; |
| } |
| |
| if (image_aspect != VK_IMAGE_ASPECT_NONE) { |
| VkClearAttachment clear_attachment = { |
| .aspectMask = image_aspect, |
| .colorAttachment = index, |
| .clearValue = info->clear_values[view_idx], |
| }; |
| VkClearRect rect = { |
| .rect = info->render_area, |
| .baseArrayLayer = 0, |
| .layerCount = info->framebuffer->layers, |
| }; |
| |
| assert(view_idx < info->clear_value_count); |
| |
| pvr_clear_attachments_render_init(cmd_buffer, &clear_attachment, &rect); |
| |
| *index_list_clear_mask |= (1 << index); |
| } |
| } |
| |
| static void |
| pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info; |
| const struct pvr_framebuffer *framebuffer = info->framebuffer; |
| const struct pvr_render_pass *pass = info->pass; |
| const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render]; |
| |
| /* Mask of attachment clears using index lists instead of the background |
| * object to clear. |
| */ |
| uint32_t index_list_clear_mask = 0; |
| |
| for (uint32_t i = 0; i < hw_render->color_init_count; i++) { |
| pvr_perform_start_of_render_attachment_clear(cmd_buffer, |
| framebuffer, |
| i, |
| false, |
| &index_list_clear_mask); |
| } |
| |
| info->enable_bg_tag = !!hw_render->color_init_count; |
| |
| /* If we're not using an index list for all clears/loads then we need to |
| * run the background object on empty tiles. |
| */ |
| if (hw_render->color_init_count && |
| index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) { |
| info->process_empty_tiles = true; |
| } else { |
| info->process_empty_tiles = false; |
| } |
| |
| if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) { |
| uint32_t ds_index_list = 0; |
| |
| pvr_perform_start_of_render_attachment_clear(cmd_buffer, |
| framebuffer, |
| 0, |
| true, |
| &ds_index_list); |
| } |
| |
| if (index_list_clear_mask) |
| pvr_finishme("Add support for generating loadops shaders!"); |
| } |
| |
| static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| const struct pvr_render_pass *pass = state->render_pass_info.pass; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &pass->hw_setup->renders[sub_cmd->hw_render_idx]; |
| |
| if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) { |
| struct pvr_image_view **iviews = state->render_pass_info.attachments; |
| |
| state->depth_format = iviews[hw_render->ds_attach_idx]->vk.format; |
| } |
| } |
| |
| static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup) |
| { |
| for (uint32_t i = 0; i < hw_setup->render_count; i++) { |
| struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i]; |
| uint32_t render_targets_count = hw_render->init_setup.num_render_targets; |
| |
| for (uint32_t j = 0; |
| j < (hw_render->color_init_count * render_targets_count); |
| j += render_targets_count) { |
| for (uint32_t k = 0; k < hw_render->init_setup.num_render_targets; |
| k++) { |
| if (hw_render->color_init[j + k].op == |
| VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| return true; |
| } |
| } |
| } |
| if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR || |
| hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer, |
| const VkRenderPassBeginInfo *pRenderPassBegin) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| |
| /* Free any previously allocated clear values. */ |
| vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values); |
| |
| if (pRenderPassBegin->clearValueCount) { |
| const size_t size = pRenderPassBegin->clearValueCount * |
| sizeof(*state->render_pass_info.clear_values); |
| |
| state->render_pass_info.clear_values = |
| vk_zalloc(&cmd_buffer->vk.pool->alloc, |
| size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!state->render_pass_info.clear_values) { |
| return vk_command_buffer_set_error(&cmd_buffer->vk, |
| VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| memcpy(state->render_pass_info.clear_values, |
| pRenderPassBegin->pClearValues, |
| size); |
| } else { |
| state->render_pass_info.clear_values = NULL; |
| } |
| |
| state->render_pass_info.clear_value_count = |
| pRenderPassBegin->clearValueCount; |
| |
| return VK_SUCCESS; |
| } |
| |
| /** |
| * \brief Indicates whether to use the large or normal clear state words. |
| * |
| * If the current render area can fit within a quarter of the max framebuffer |
| * that the device is capable of, we can use the normal clear state words, |
| * otherwise the large clear state words are needed. |
| * |
| * The requirement of a quarter of the max framebuffer comes from the index |
| * count used in the normal clear state words and the vertices uploaded at |
| * device creation. |
| * |
| * \param[in] cmd_buffer The command buffer for the clear. |
| * \return true if large clear state words are required. |
| */ |
| static bool |
| pvr_is_large_clear_required(const struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| const struct pvr_device_info *const dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| const VkRect2D render_area = cmd_buffer->state.render_pass_info.render_area; |
| const uint32_t vf_max_x = rogue_get_param_vf_max_x(dev_info); |
| const uint32_t vf_max_y = rogue_get_param_vf_max_x(dev_info); |
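| /* Note: the Y limit reuses the X query above; this assumes the VF |
| * maximum is symmetric in X and Y. |
| */ |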
| |
| return (render_area.extent.width > (vf_max_x / 2) - 1) || |
| (render_area.extent.height > (vf_max_y / 2) - 1); |
| } |
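| /* Illustration (hypothetical limit): if the VF maximum were 16384, any |
| * render area wider or taller than 8191 pixels would require the large |
| * clear state words. |
| */ |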
| |
| static void pvr_emit_clear_words(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| struct pvr_device *device = cmd_buffer->device; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| uint32_t vdm_state_size_in_dw; |
| const uint32_t *vdm_state; |
| uint32_t *stream; |
| |
| vdm_state_size_in_dw = |
| pvr_clear_vdm_state_get_size_in_dw(&device->pdevice->dev_info, 1); |
| |
| pvr_csb_set_relocation_mark(csb); |
| |
| stream = pvr_csb_alloc_dwords(csb, vdm_state_size_in_dw); |
| if (!stream) { |
| pvr_cmd_buffer_set_error_unwarned(cmd_buffer, csb->status); |
| return; |
| } |
| |
| if (pvr_is_large_clear_required(cmd_buffer)) |
| vdm_state = device->static_clear_state.large_clear_vdm_words; |
| else |
| vdm_state = device->static_clear_state.vdm_words; |
| |
| memcpy(stream, vdm_state, PVR_DW_TO_BYTES(vdm_state_size_in_dw)); |
| |
| pvr_csb_clear_relocation_mark(csb); |
| } |
| |
| static VkResult pvr_cs_write_load_op(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_gfx *sub_cmd, |
| struct pvr_load_op *load_op, |
| uint32_t isp_userpass) |
| { |
| const struct pvr_device *device = cmd_buffer->device; |
| struct pvr_static_clear_ppp_template template = |
| device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT]; |
| uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT]; |
| struct pvr_pds_upload shareds_update_program; |
| struct pvr_suballoc_bo *pvr_bo; |
| VkResult result; |
| |
| result = pvr_load_op_data_create_and_upload(cmd_buffer, |
| load_op, |
| &shareds_update_program); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| template.config.ispctl.upass = isp_userpass; |
| |
| /* It might look odd that we aren't specifying the code segment's |
| * address anywhere. This is because the hardware always assumes that the |
| * data size is 2 128-bit words and the code segment starts after that. |
| */ |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE], |
| TA_STATE_PDS_SHADERBASE, |
| shaderbase) { |
| shaderbase.addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset); |
| } |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXUNICODEBASE], |
| TA_STATE_PDS_TEXUNICODEBASE, |
| texunicodebase) { |
| texunicodebase.addr = |
| PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset); |
| } |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO1], |
| TA_STATE_PDS_SIZEINFO1, |
| sizeinfo1) { |
| /* Dummy coefficient loading program. */ |
| sizeinfo1.pds_varyingsize = 0; |
| |
| sizeinfo1.pds_texturestatesize = DIV_ROUND_UP( |
| shareds_update_program.data_size, |
| ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE); |
| |
| sizeinfo1.pds_tempsize = |
| DIV_ROUND_UP(load_op->temps_count, |
| ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE); |
| } |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO2], |
| TA_STATE_PDS_SIZEINFO2, |
| sizeinfo2) { |
| sizeinfo2.usc_sharedsize = |
| DIV_ROUND_UP(load_op->const_shareds_count, |
| ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE); |
| } |
| |
| /* Dummy coefficient loading program. */ |
| pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_VARYINGBASE] = 0; |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXTUREDATABASE], |
| TA_STATE_PDS_TEXTUREDATABASE, |
| texturedatabase) { |
| texturedatabase.addr = PVR_DEV_ADDR(shareds_update_program.data_offset); |
| } |
| |
| template.config.pds_state = &pds_state; |
| |
| pvr_emit_ppp_from_template(&sub_cmd->control_stream, &template, &pvr_bo); |
| list_add(&pvr_bo->link, &cmd_buffer->bo_list); |
| |
| pvr_emit_clear_words(cmd_buffer, sub_cmd); |
| |
| pvr_reset_graphics_dirty_state(cmd_buffer, false); |
| |
| return VK_SUCCESS; |
| } |
| |
| void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, |
| const VkRenderPassBeginInfo *pRenderPassBeginInfo, |
| const VkSubpassBeginInfo *pSubpassBeginInfo) |
| { |
| PVR_FROM_HANDLE(pvr_framebuffer, |
| framebuffer, |
| pRenderPassBeginInfo->framebuffer); |
| PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass); |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| assert(!state->render_pass_info.pass); |
| assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); |
| |
| /* FIXME: Create a separate function for everything using pass->subpasses, |
| * look at cmd_buffer_begin_subpass() for example. */ |
| state->render_pass_info.pass = pass; |
| state->render_pass_info.framebuffer = framebuffer; |
| state->render_pass_info.subpass_idx = 0; |
| state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea; |
| state->render_pass_info.current_hw_subpass = 0; |
| state->render_pass_info.pipeline_bind_point = |
| pass->subpasses[0].pipeline_bind_point; |
| state->render_pass_info.isp_userpass = pass->subpasses[0].isp_userpass; |
| state->dirty.isp_userpass = true; |
| |
| result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| result = pvr_init_render_targets(cmd_buffer->device, pass, framebuffer); |
| if (result != VK_SUCCESS) { |
| pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| return; |
| } |
| |
| result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo); |
| if (result != VK_SUCCESS) |
| return; |
| |
| assert(pass->subpasses[0].pipeline_bind_point == |
| VK_PIPELINE_BIND_POINT_GRAPHICS); |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Run subpass 0 "soft" background object after the actual background |
| * object. |
| */ |
| hw_subpass = pvr_get_hw_subpass(pass, 0); |
| if (hw_subpass->load_op) { |
| result = pvr_cs_write_load_op(cmd_buffer, |
| &cmd_buffer->state.current_sub_cmd->gfx, |
| hw_subpass->load_op, |
| 0); |
| if (result != VK_SUCCESS) |
| return; |
| } |
| |
| pvr_perform_start_of_render_clears(cmd_buffer); |
| pvr_stash_depth_format(&cmd_buffer->state, |
| &cmd_buffer->state.current_sub_cmd->gfx); |
| } |
| |
| VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer, |
| const VkCommandBufferBeginInfo *pBeginInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state; |
| VkResult result; |
| |
| vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo); |
| |
| cmd_buffer->usage_flags = pBeginInfo->flags; |
| state = &cmd_buffer->state; |
| |
| /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for |
| * primary level command buffers. |
| * |
| * From the Vulkan 1.0 spec: |
| * |
| * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a |
| * secondary command buffer is considered to be entirely inside a render |
| * pass. If this is a primary command buffer, then this bit is ignored. |
| */ |
| if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { |
| cmd_buffer->usage_flags &= |
| ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT; |
| } |
| |
| if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { |
| if (cmd_buffer->usage_flags & |
| VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { |
| const VkCommandBufferInheritanceInfo *inheritance_info = |
| pBeginInfo->pInheritanceInfo; |
| struct pvr_render_pass *pass; |
| |
| pass = pvr_render_pass_from_handle(inheritance_info->renderPass); |
| state->render_pass_info.pass = pass; |
| state->render_pass_info.framebuffer = |
| pvr_framebuffer_from_handle(inheritance_info->framebuffer); |
| state->render_pass_info.subpass_idx = inheritance_info->subpass; |
| state->render_pass_info.isp_userpass = |
| pass->subpasses[inheritance_info->subpass].isp_userpass; |
| |
| result = |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| state->vis_test_enabled = inheritance_info->occlusionQueryEnable; |
| } |
| |
| state->dirty.isp_userpass = true; |
| } |
| |
| util_dynarray_init(&state->query_indices, NULL); |
| |
| memset(state->barriers_needed, |
| 0xFF, |
| sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed)); |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_transfer_cmd *transfer_cmd) |
| { |
| struct pvr_sub_cmd_transfer *sub_cmd; |
| VkResult result; |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer; |
| |
| list_addtail(&transfer_cmd->link, sub_cmd->transfer_cmds); |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_graphics_pipeline *const gfx_pipeline) |
| { |
| const struct pvr_vertex_shader_state *const vertex_state = |
| &gfx_pipeline->shader_state.vertex; |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_pds_info *const pds_info = state->pds_shader.info; |
| struct pvr_suballoc_bo *pvr_bo; |
| const uint8_t *entries; |
| uint32_t *dword_buffer; |
| uint64_t *qword_buffer; |
| VkResult result; |
| |
| result = |
| pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.pds_heap, |
| PVR_DW_TO_BYTES(pds_info->data_size_in_dwords), |
| &pvr_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo); |
| qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo); |
| |
| entries = (uint8_t *)pds_info->entries; |
| |
| for (uint32_t i = 0; i < pds_info->entry_count; i++) { |
| const struct pvr_const_map_entry *const entry_header = |
| (struct pvr_const_map_entry *)entries; |
| |
| switch (entry_header->type) { |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: { |
| const struct pvr_const_map_entry_literal32 *const literal = |
| (struct pvr_const_map_entry_literal32 *)entries; |
| |
| PVR_WRITE(dword_buffer, |
| literal->literal_value, |
| literal->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*literal); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: { |
| const struct pvr_const_map_entry_doutu_address *const doutu_addr = |
| (struct pvr_const_map_entry_doutu_address *)entries; |
| |
| const pco_data *const vs_data = &state->gfx_pipeline->vs_data; |
| const pvr_dev_addr_t exec_addr = |
| PVR_DEV_ADDR_OFFSET(vertex_state->bo->dev_addr, |
| vs_data->common.entry_offset); |
| uint64_t addr = 0ULL; |
| |
| pvr_set_usc_execution_address64(&addr, exec_addr.addr); |
| |
| PVR_WRITE(qword_buffer, |
| addr | doutu_addr->doutu_control, |
| doutu_addr->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*doutu_addr); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: { |
| const struct pvr_const_map_entry_base_instance *const base_instance = |
| (struct pvr_const_map_entry_base_instance *)entries; |
| |
| PVR_WRITE(dword_buffer, |
| state->draw_state.base_instance, |
| base_instance->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*base_instance); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_VERTEX: { |
| const struct pvr_const_map_entry_base_instance *const base_instance = |
| (struct pvr_const_map_entry_base_instance *)entries; |
| |
| PVR_WRITE(dword_buffer, |
| state->draw_state.base_vertex, |
| base_instance->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*base_instance); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: { |
| const struct pvr_const_map_entry_vertex_attribute_address |
| *const attribute = |
| (struct pvr_const_map_entry_vertex_attribute_address *)entries; |
| const struct pvr_vertex_binding *const binding = |
| &state->vertex_bindings[attribute->binding_index]; |
| /* In relation to the Vulkan spec. 22.4. Vertex Input Address |
| * Calculation: |
| * Adding binding->offset corresponds to calculating the |
| * `bufferBindingAddress`. Adding attribute->offset corresponds to |
| * adding the `attribDesc.offset`. The `effectiveVertexOffset` is |
| * taken care of by the PDS program itself with a DDMAD, which |
| * multiplies the vertex/instance idx by the binding's stride and |
| * adds that to the address provided here. |
| */ |
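| /* Hypothetical example: with a buffer at 0x10000, a binding offset of |
| * 0x100 and an attribute offset of 0x10, the address written here is |
| * 0x10110; the DDMAD then adds (vertex/instance idx * stride) at run |
| * time. |
| */ |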
| const pvr_dev_addr_t addr = |
| PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr, |
| binding->offset + attribute->offset); |
| |
| PVR_WRITE(qword_buffer, |
| addr.addr, |
| attribute->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*attribute); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_ROBUST_VERTEX_ATTRIBUTE_ADDRESS: { |
| const struct pvr_const_map_entry_robust_vertex_attribute_address |
| *const attribute = |
| (struct pvr_const_map_entry_robust_vertex_attribute_address *) |
| entries; |
| const struct pvr_vertex_binding *const binding = |
| &state->vertex_bindings[attribute->binding_index]; |
| pvr_dev_addr_t addr; |
| |
| if (binding->buffer->vk.size < |
| (attribute->offset + attribute->component_size_in_bytes)) { |
| /* Replace with a load from the robustness buffer when the attribute |
| * does not fit within the bound buffer. |
| */ |
| addr = PVR_DEV_ADDR_OFFSET( |
| cmd_buffer->device->robustness_buffer->vma->dev_addr, |
| attribute->robustness_buffer_offset); |
| } else { |
| addr = PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr, |
| binding->offset + attribute->offset); |
| } |
| |
| PVR_WRITE(qword_buffer, |
| addr.addr, |
| attribute->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*attribute); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX: { |
| const struct pvr_const_map_entry_vertex_attribute_max_index *attribute = |
| (struct pvr_const_map_entry_vertex_attribute_max_index *)entries; |
| const struct pvr_vertex_binding *const binding = |
| &state->vertex_bindings[attribute->binding_index]; |
| const uint64_t bound_size = binding->buffer->vk.size - binding->offset; |
| const uint32_t attribute_end = |
| attribute->offset + attribute->component_size_in_bytes; |
| uint32_t max_index; |
| |
| if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info, |
| pds_ddmadt)) { |
| /* TODO: PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX |
| * has the same define value as |
| * PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTR_DDMADT_OOB_BUFFER_SIZE |
| * so maybe we want to remove one of the defines or change the |
| * values. |
| */ |
| pvr_finishme("Unimplemented robust buffer access with DDMADT"); |
| assert(false); |
| } |
| |
      /* If the stride is 0 then all attributes fetch the same single element
       * from the binding, so the maximum index is 0.
       */
| if (bound_size < attribute_end || attribute->stride == 0) { |
| max_index = 0; |
| } else { |
| max_index = (uint32_t)(bound_size / attribute->stride) - 1; |
| |
         /* One more attribute fits within the final partial stride. */
| if (bound_size % attribute->stride >= attribute_end) |
| max_index++; |
| } |
| |
| PVR_WRITE(dword_buffer, |
| max_index, |
| attribute->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*attribute); |
| break; |
| } |
| |
| default: |
| unreachable("Unsupported data section map"); |
| break; |
| } |
| } |
| |
| state->pds_vertex_attrib_offset = |
| pvr_bo->dev_addr.addr - |
| cmd_buffer->device->heaps.pds_heap->base_addr.addr; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult pvr_setup_descriptor_mappings( |
| struct pvr_cmd_buffer *const cmd_buffer, |
| enum pvr_stage_allocation stage, |
| const struct pvr_stage_allocation_descriptor_state *descriptor_state, |
   const pvr_dev_addr_t *const num_workgroups_buff_addr,
| uint32_t *const descriptor_data_offset_out) |
| { |
| const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info; |
| const struct pvr_descriptor_state *desc_state; |
| struct pvr_suballoc_bo *pvr_bo; |
| const uint8_t *entries; |
| uint32_t *dword_buffer; |
| uint64_t *qword_buffer; |
| VkResult result; |
| |
| if (!pds_info->data_size_in_dwords) |
| return VK_SUCCESS; |
| |
| result = |
| pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.pds_heap, |
| PVR_DW_TO_BYTES(pds_info->data_size_in_dwords), |
| &pvr_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo); |
| qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo); |
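   /* Both pointers alias the same mapped suballocation; each const map entry
    * below selects the 32-bit or 64-bit view depending on whether it writes a
    * dword or a qword through PVR_WRITE().
    */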
| |
| entries = (uint8_t *)pds_info->entries; |
| |
| switch (stage) { |
| case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY: |
| case PVR_STAGE_ALLOCATION_FRAGMENT: |
| desc_state = &cmd_buffer->state.gfx_desc_state; |
| break; |
| |
| case PVR_STAGE_ALLOCATION_COMPUTE: |
| desc_state = &cmd_buffer->state.compute_desc_state; |
| break; |
| |
| default: |
| unreachable("Unsupported stage."); |
| break; |
| } |
| |
| for (uint32_t i = 0; i < pds_info->entry_count; i++) { |
| const struct pvr_const_map_entry *const entry_header = |
| (struct pvr_const_map_entry *)entries; |
| |
| switch (entry_header->type) { |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: { |
| const struct pvr_const_map_entry_literal32 *const literal = |
| (struct pvr_const_map_entry_literal32 *)entries; |
| |
| PVR_WRITE(dword_buffer, |
| literal->literal_value, |
| literal->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*literal); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: { |
| const struct pvr_const_map_entry_descriptor_set *desc_set_entry = |
| (struct pvr_const_map_entry_descriptor_set *)entries; |
| const uint32_t desc_set = desc_set_entry->descriptor_set; |
| const struct pvr_descriptor_set *descriptor_set; |
| pvr_dev_addr_t desc_set_addr; |
| |
| assert(desc_set < PVR_MAX_DESCRIPTOR_SETS); |
| |
| descriptor_set = desc_state->sets[desc_set]; |
| assert(descriptor_set); |
| |
| desc_set_addr = descriptor_set->dev_addr; |
| PVR_WRITE(qword_buffer, |
| desc_set_addr.addr, |
| desc_set_entry->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*desc_set_entry); |
| break; |
| } |
| |
| default: |
| unreachable("Unsupported map entry type."); |
| } |
| } |
| |
| *descriptor_data_offset_out = |
| pvr_bo->dev_addr.addr - |
| cmd_buffer->device->heaps.pds_heap->base_addr.addr; |
| |
| return VK_SUCCESS; |
| } |
| |
| static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd) |
| { |
| const struct pvr_device *device = cmd_buffer->device; |
| const struct pvr_physical_device *pdevice = device->pdevice; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| const struct pvr_compute_pipeline *pipeline = state->compute_pipeline; |
| const uint32_t const_shared_regs = |
| pipeline->shader_state.const_shared_reg_count; |
| struct pvr_compute_kernel_info info; |
| |
| /* No shared regs, no need to use an allocation kernel. */ |
| if (!const_shared_regs) |
| return; |
| |
| /* Accumulate the MAX number of shared registers across the kernels in this |
| * dispatch. This is used by the FW for context switching, so must be large |
| * enough to contain all the shared registers that might be in use for this |
| * compute job. Coefficients don't need to be included as the context switch |
| * will not happen within the execution of a single workgroup, thus nothing |
| * needs to be preserved. |
| */ |
| state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs); |
| |
| info = (struct pvr_compute_kernel_info){ |
| .indirect_buffer_addr = PVR_DEV_ADDR_INVALID, |
| .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE, |
| |
| .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL, |
| .usc_common_shared = true, |
| .usc_common_size = |
| DIV_ROUND_UP(const_shared_regs, |
| ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE), |
| |
| .global_size = { 1, 1, 1 }, |
| .local_size = { 1, 1, 1 }, |
| }; |
| |
| /* Sometimes we don't have a secondary program if there were no constants to |
| * write, but we still need to run a PDS program to accomplish the |
| * allocation of the local/common store shared registers. Use the |
| * pre-uploaded empty PDS program in this instance. |
| */ |
| if (pipeline->descriptor_state.pds_info.code_size_in_dwords) { |
| uint32_t pds_data_size_in_dwords = |
| pipeline->descriptor_state.pds_info.data_size_in_dwords; |
| |
| info.pds_data_offset = state->pds_compute_descriptor_data_offset; |
| info.pds_data_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_data_size_in_dwords), |
| ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE); |
| |
      /* Check that the code section has been uploaded. */
| assert(pipeline->descriptor_state.pds_code.code_size); |
| info.pds_code_offset = pipeline->descriptor_state.pds_code.code_offset; |
| } else { |
| const struct pvr_pds_upload *program = &device->pds_compute_empty_program; |
| |
| info.pds_data_offset = program->data_offset; |
| info.pds_data_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size), |
| ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE); |
| info.pds_code_offset = program->code_offset; |
| } |
| |
| /* We don't need to pad the workgroup size. */ |
| |
| info.max_instances = |
| pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
| |
| void pvr_compute_update_shared_private( |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd, |
| struct pvr_private_compute_pipeline *pipeline) |
| { |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| const uint32_t const_shared_regs = pipeline->const_shared_regs_count; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| struct pvr_compute_kernel_info info; |
| |
| /* No shared regs, no need to use an allocation kernel. */ |
| if (!const_shared_regs) |
| return; |
| |
| /* See comment in pvr_compute_update_shared() for details on this. */ |
| state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs); |
| |
| info = (struct pvr_compute_kernel_info){ |
| .indirect_buffer_addr = PVR_DEV_ADDR_INVALID, |
| .usc_common_size = |
| DIV_ROUND_UP(const_shared_regs, |
| ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE), |
| .pds_data_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_shared_update_data_size_dw), |
| ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE), |
| .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL, |
| .pds_data_offset = pipeline->pds_shared_update_data_offset, |
| .pds_code_offset = pipeline->pds_shared_update_code_offset, |
| .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE, |
| .usc_common_shared = true, |
| .global_size = { 1, 1, 1 }, |
| .local_size = { 1, 1, 1 }, |
| }; |
| |
| /* We don't need to pad the workgroup size. */ |
| |
| info.max_instances = |
| pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
| |
| static uint32_t |
| pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice, |
| uint32_t workgroup_size, |
| uint32_t coeff_regs_count) |
| { |
| const struct pvr_device_runtime_info *dev_runtime_info = |
| &pdevice->dev_runtime_info; |
| const struct pvr_device_info *dev_info = &pdevice->dev_info; |
| uint32_t max_avail_coeff_regs = |
| dev_runtime_info->cdm_max_local_mem_size_regs; |
| uint32_t coeff_regs_count_aligned = |
| ALIGN_POT(coeff_regs_count, |
| ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE >> 2U); |
| |
   /* If the workgroup size is greater than ROGUE_MAX_INSTANCES_PER_TASK we
    * always pad it up to the next multiple of ROGUE_MAX_INSTANCES_PER_TASK.
    *
    * We also round the workgroup size up to the next multiple of
    * ROGUE_MAX_INSTANCES_PER_TASK if more than 1/8th of the maximum
    * coefficient registers would be used.
    */
| /* TODO: See if this can be optimized. */ |
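   /* Illustrative example (assuming ROGUE_MAX_INSTANCES_PER_TASK were 32): a
    * flattened workgroup size of 50 would be padded up to 64.
    */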
| if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK || |
| coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) { |
| assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info)); |
| |
| return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK); |
| } |
| |
| return workgroup_size; |
| } |
| |
| void pvr_compute_update_kernel_private( |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd, |
| struct pvr_private_compute_pipeline *pipeline, |
| const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS]) |
| { |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| const struct pvr_device_runtime_info *dev_runtime_info = |
| &pdevice->dev_runtime_info; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| |
| struct pvr_compute_kernel_info info = { |
| .indirect_buffer_addr = PVR_DEV_ADDR_INVALID, |
| .usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY, |
| .pds_temp_size = |
| DIV_ROUND_UP(pipeline->pds_temps_used << 2U, |
| ROGUE_CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE), |
| |
| .pds_data_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_data_size_dw), |
| ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE), |
| .pds_data_offset = pipeline->pds_data_offset, |
| .pds_code_offset = pipeline->pds_code_offset, |
| |
| .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE, |
| |
| .usc_unified_size = |
| DIV_ROUND_UP(pipeline->unified_store_regs_count << 2U, |
| ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE), |
| |
| /* clang-format off */ |
| .global_size = { |
| global_workgroup_size[0], |
| global_workgroup_size[1], |
| global_workgroup_size[2] |
| }, |
| /* clang-format on */ |
| }; |
| |
| uint32_t work_size = pipeline->workgroup_size.width * |
| pipeline->workgroup_size.height * |
| pipeline->workgroup_size.depth; |
| uint32_t coeff_regs; |
| |
| if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) { |
| /* Enforce a single workgroup per cluster through allocation starvation. |
| */ |
| coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs; |
| } else { |
| coeff_regs = pipeline->coeff_regs_count; |
| } |
| |
| info.usc_common_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs), |
| ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE); |
| |
| /* Use a whole slot per workgroup. */ |
| work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK); |
| |
| coeff_regs += pipeline->const_shared_regs_count; |
| |
| if (pipeline->const_shared_regs_count > 0) |
| info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC; |
| |
| work_size = |
| pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs); |
| |
| info.local_size[0] = work_size; |
| info.local_size[1] = 1U; |
| info.local_size[2] = 1U; |
| |
| info.max_instances = |
| pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
| |
| /* TODO: Wire up the base_workgroup variant program when implementing |
| * VK_KHR_device_group. The values will also need patching into the program. |
| */ |
| static void pvr_compute_update_kernel( |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd, |
| pvr_dev_addr_t indirect_addr, |
| const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS]) |
| { |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| const struct pvr_device_runtime_info *dev_runtime_info = |
| &pdevice->dev_runtime_info; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| const struct pvr_compute_pipeline *pipeline = state->compute_pipeline; |
| const struct pvr_compute_shader_state *shader_state = |
| &pipeline->shader_state; |
| const struct pvr_pds_info *program_info = &pipeline->primary_program_info; |
| |
| struct pvr_compute_kernel_info info = { |
| .indirect_buffer_addr = indirect_addr, |
| .usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY, |
| .pds_temp_size = |
| DIV_ROUND_UP(program_info->temps_required << 2U, |
| ROGUE_CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE), |
| |
| .pds_data_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords), |
| ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE), |
| .pds_data_offset = pipeline->primary_program.data_offset, |
| .pds_code_offset = pipeline->primary_program.code_offset, |
| |
| .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE, |
| |
| .usc_unified_size = |
| DIV_ROUND_UP(shader_state->input_register_count << 2U, |
| ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE), |
| |
| /* clang-format off */ |
| .global_size = { |
| global_workgroup_size[0], |
| global_workgroup_size[1], |
| global_workgroup_size[2] |
| }, |
| /* clang-format on */ |
| }; |
| |
| uint32_t work_size = shader_state->work_size; |
| uint32_t coeff_regs; |
| |
| if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) { |
| /* Enforce a single workgroup per cluster through allocation starvation. |
| */ |
| coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs; |
| } else { |
| coeff_regs = shader_state->coefficient_register_count; |
| } |
| |
| info.usc_common_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs), |
| ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE); |
| |
| /* Use a whole slot per workgroup. */ |
| work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK); |
| |
| coeff_regs += shader_state->const_shared_reg_count; |
| |
| if (shader_state->const_shared_reg_count > 0) |
| info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC; |
| |
| work_size = |
| pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs); |
| |
| info.local_size[0] = work_size; |
| info.local_size[1] = 1U; |
| info.local_size[2] = 1U; |
| |
| info.max_instances = |
| pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
| |
| static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_suballoc_bo *suballoc_bo; |
| VkResult result; |
| |
   /* TODO: Here are some possible optimizations/things to consider:
    *
    *    - Currently we upload maxPushConstantsSize. The application might only
    *      be using a portion of that, so we might end up with unused memory.
    *      Should we be smarter about this? If we intend to upload the push
    *      consts into shareds, we definitely want to avoid reserving unused
    *      regs.
    *
    *    - For now we have to upload to a new buffer each time since the
    *      shaders access the push constants from memory. If we were to reuse
    *      the same buffer we might update the contents out of sync with job
    *      submission, and the shaders could see the updated contents while the
    *      command buffer is still being recorded and not yet submitted.
    *      If we were to upload the push constants directly to shared regs we
    *      could reuse the same buffer (avoiding extra allocation overhead)
    *      since the contents will be DMAed only on job submission when the
    *      control stream is processed and the PDS program is executed. This
    *      approach would also allow us to avoid regenerating the PDS data
    *      section in some cases since the buffer address will be constant.
    */
| |
| if (cmd_buffer->state.push_constants.uploaded) |
| return VK_SUCCESS; |
| |
| result = pvr_cmd_buffer_upload_general(cmd_buffer, |
| state->push_constants.data, |
| sizeof(state->push_constants.data), |
| &suballoc_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| cmd_buffer->state.push_constants.dev_addr = suballoc_bo->dev_addr; |
| cmd_buffer->state.push_constants.uploaded = true; |
| |
| return VK_SUCCESS; |
| } |
| |
| static void pvr_cmd_dispatch( |
| struct pvr_cmd_buffer *const cmd_buffer, |
| const pvr_dev_addr_t indirect_addr, |
| const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS]) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| const struct pvr_compute_pipeline *compute_pipeline = |
| state->compute_pipeline; |
| struct pvr_sub_cmd_compute *sub_cmd; |
| VkResult result; |
| |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE); |
| |
| sub_cmd = &state->current_sub_cmd->compute; |
| sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops; |
| sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier; |
| |
| if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) { |
| result = pvr_cmd_upload_push_consts(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Regenerate the PDS program to use the new push consts buffer. */ |
| state->dirty.compute_desc_dirty = true; |
| |
| state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; |
| } |
| |
| unreachable("compute descriptor support"); |
| |
| pvr_compute_update_shared(cmd_buffer, sub_cmd); |
| pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size); |
| } |
| |
| void pvr_CmdDispatch(VkCommandBuffer commandBuffer, |
| uint32_t groupCountX, |
| uint32_t groupCountY, |
| uint32_t groupCountZ) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| if (!groupCountX || !groupCountY || !groupCountZ) |
| return; |
| |
| pvr_cmd_dispatch(cmd_buffer, |
| PVR_DEV_ADDR_INVALID, |
| (uint32_t[]){ groupCountX, groupCountY, groupCountZ }); |
| } |
| |
| void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer, |
| VkBuffer _buffer, |
| VkDeviceSize offset) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_cmd_dispatch(cmd_buffer, |
| PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset), |
| (uint32_t[]){ 1, 1, 1 }); |
| } |
| |
| static void |
| pvr_update_draw_state(struct pvr_cmd_buffer_state *const state, |
| const struct pvr_cmd_buffer_draw_state *const draw_state) |
| { |
   /* We don't have a piece of state that tells us whether base_instance is in
    * use, so it gets treated as a boolean: 0 means we'll use a PDS program
    * that skips the base instance addition. If base_instance gets used (and
    * the last draw's base_instance was 0) then we switch to the BASE_INSTANCE
    * attrib program.
    *
    * If base_instance changes then we only need to update the data section.
    *
    * The only draw call state that doesn't really matter is the start vertex
    * as that is handled properly in the VDM state in all cases.
    */
| if ((state->draw_state.draw_indexed != draw_state->draw_indexed) || |
| (state->draw_state.draw_indirect != draw_state->draw_indirect) || |
| (state->draw_state.base_instance == 0 && |
| draw_state->base_instance != 0)) { |
| state->dirty.draw_variant = true; |
| } else if (state->draw_state.base_instance != draw_state->base_instance) { |
| state->dirty.draw_base_instance = true; |
| } |
| |
| state->draw_state = *draw_state; |
| } |
| |
| static uint32_t pvr_calc_shared_regs_count( |
| const struct pvr_graphics_pipeline *const gfx_pipeline) |
| { |
| uint32_t shared_regs = gfx_pipeline->vs_data.common.shareds; |
| |
| if (gfx_pipeline->shader_state.fragment.bo) { |
| uint32_t fragment_regs = gfx_pipeline->fs_data.common.shareds; |
| shared_regs = MAX2(shared_regs, fragment_regs); |
| } |
| |
| return shared_regs; |
| } |
| |
| static void |
| pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd, |
| const uint32_t pds_vertex_descriptor_data_offset) |
| { |
| const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_stage_allocation_descriptor_state |
| *const vertex_descriptor_state = |
| &state->gfx_pipeline->shader_state.vertex.descriptor_state; |
| const pco_data *const vs_data = &state->gfx_pipeline->vs_data; |
| struct pvr_csb *const csb = &sub_cmd->control_stream; |
| |
| if (!vertex_descriptor_state->pds_info.code_size_in_dwords) |
| return; |
| |
| pvr_csb_set_relocation_mark(csb); |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) { |
| state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ALL; |
| |
| state0.usc_common_size = |
| DIV_ROUND_UP(vs_data->common.shareds, |
| ROGUE_VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE); |
| |
| state0.pds_data_size = DIV_ROUND_UP( |
| PVR_DW_TO_BYTES(vertex_descriptor_state->pds_info.data_size_in_dwords), |
| ROGUE_VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE); |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) { |
| state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset); |
| state1.sd_type = ROGUE_VDMCTRL_SD_TYPE_NONE; |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) { |
| state2.pds_code_addr = |
| PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset); |
| } |
| |
| pvr_csb_clear_relocation_mark(csb); |
| } |
| |
| static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| const struct pvr_graphics_pipeline *const gfx_pipeline = |
| cmd_buffer->state.gfx_pipeline; |
| struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header; |
| struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; |
| const pco_data *const vs_data = &gfx_pipeline->vs_data; |
| const pco_data *const fs_data = &gfx_pipeline->fs_data; |
| uint32_t output_selects; |
| uint32_t varying[2]; |
| |
| const pco_range *varyings = vs_data->vs.varyings; |
| |
| const bool has_point_size = varyings[VARYING_SLOT_PSIZ].count > 0; |
| |
| const bool has_viewport = varyings[VARYING_SLOT_VIEWPORT].count > 0; |
| |
| const bool has_layer = varyings[VARYING_SLOT_LAYER].count > 0; |
| |
| pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) { |
| state.rhw_pres = fs_data->fs.uses.w; |
| state.tsp_unclamped_z_pres = fs_data->fs.uses.z; |
| |
| state.vtxsize = vs_data->vs.vtxouts; |
| state.psprite_size_pres = has_point_size; |
| state.vpt_tgt_pres = has_viewport; |
| state.render_tgt_pres = has_layer; |
| } |
| |
| if (ppp_state->output_selects != output_selects) { |
| ppp_state->output_selects = output_selects; |
| header->pres_outselects = true; |
| } |
| |
| pvr_csb_pack (&varying[0], TA_STATE_VARYING0, varying0) { |
| varying0.f32_linear = vs_data->vs.f32_smooth; |
| varying0.f32_flat = vs_data->vs.f32_flat; |
| varying0.f32_npc = vs_data->vs.f32_npc; |
| } |
| |
| if (ppp_state->varying_word[0] != varying[0]) { |
| ppp_state->varying_word[0] = varying[0]; |
| header->pres_varying_word0 = true; |
| } |
| |
| pvr_csb_pack (&varying[1], TA_STATE_VARYING1, varying1) { |
| varying1.f16_linear = vs_data->vs.f16_smooth; |
| varying1.f16_flat = vs_data->vs.f16_flat; |
| varying1.f16_npc = vs_data->vs.f16_npc; |
| } |
| |
| if (ppp_state->varying_word[1] != varying[1]) { |
| ppp_state->varying_word[1] = varying[1]; |
| header->pres_varying_word1 = true; |
| } |
| } |
| |
| static void |
| pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer, |
| struct ROGUE_TA_STATE_ISPA *const ispa_out) |
| { |
| struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header; |
| const struct pvr_fragment_shader_state *const fragment_shader_state = |
| &cmd_buffer->state.gfx_pipeline->shader_state.fragment; |
| const struct pvr_render_pass_info *const pass_info = |
| &cmd_buffer->state.render_pass_info; |
| struct vk_dynamic_graphics_state *dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; |
| |
| const bool rasterizer_discard = dynamic_state->rs.rasterizer_discard_enable; |
| const uint32_t subpass_idx = pass_info->subpass_idx; |
| const uint32_t depth_stencil_attachment_idx = |
| pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment; |
| const struct pvr_render_pass_attachment *const attachment = |
| depth_stencil_attachment_idx != VK_ATTACHMENT_UNUSED |
| ? &pass_info->pass->attachments[depth_stencil_attachment_idx] |
| : NULL; |
| |
| const enum ROGUE_TA_OBJTYPE obj_type = |
| pvr_ta_objtype(dynamic_state->ia.primitive_topology); |
| |
| const VkImageAspectFlags ds_aspects = |
| (!rasterizer_discard && attachment) |
| ? vk_format_aspects(attachment->vk_format) & |
| (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) |
| : VK_IMAGE_ASPECT_NONE; |
| |
| /* This is deliberately a full copy rather than a pointer because |
| * vk_optimize_depth_stencil_state() can only be run once against any given |
| * instance of vk_depth_stencil_state. |
| */ |
| struct vk_depth_stencil_state ds_state = dynamic_state->ds; |
| |
| uint32_t ispb_stencil_off; |
| bool is_two_sided = false; |
| uint32_t isp_control; |
| |
| uint32_t line_width; |
| uint32_t common_a; |
| uint32_t front_a; |
| uint32_t front_b; |
| uint32_t back_a; |
| uint32_t back_b; |
| |
| vk_optimize_depth_stencil_state(&ds_state, ds_aspects, true); |
| |
| /* Convert to 4.4 fixed point format. */ |
| line_width = util_unsigned_fixed(dynamic_state->rs.line.width, 4); |
| |
| /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16]. |
| * If 0 it stays at 0, otherwise we subtract 1. |
| */ |
| line_width = (!!line_width) * (line_width - 1); |
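   /* E.g. a line width of 1.0 becomes 16 in 4.4 fixed point and 15 after the
    * shift above; a width of 0.0 stays 0.
    */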
| |
| line_width = MIN2(line_width, ROGUE_TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX); |
| |
| /* TODO: Part of the logic in this function is duplicated in another part |
| * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier? |
| */ |
| |
| pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) { |
| ispa.pointlinewidth = line_width; |
| |
| ispa.dcmpmode = pvr_ta_cmpmode(ds_state.depth.compare_op); |
| ispa.dwritedisable = !ds_state.depth.write_enable; |
| |
| ispa.passtype = fragment_shader_state->pass_type; |
| |
| ispa.objtype = obj_type; |
| |
| /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and |
| * objtype are needed by pvr_setup_triangle_merging_flag. |
| */ |
| if (ispa_out) |
| *ispa_out = ispa; |
| } |
| |
| /* TODO: Does this actually represent the ispb control word on stencil off? |
| * If not, rename the variable. |
| */ |
| pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) { |
| ispb.sop3 = ROGUE_TA_ISPB_STENCILOP_KEEP; |
| ispb.sop2 = ROGUE_TA_ISPB_STENCILOP_KEEP; |
| ispb.sop1 = ROGUE_TA_ISPB_STENCILOP_KEEP; |
| ispb.scmpmode = ROGUE_TA_CMPMODE_ALWAYS; |
| } |
| |
| /* FIXME: This logic should be redone and improved. Can we also get rid of |
| * the front and back variants? |
| */ |
| |
| front_a = common_a; |
| back_a = common_a; |
| |
| if (ds_state.stencil.test_enable) { |
| uint32_t front_a_sref; |
| uint32_t back_a_sref; |
| |
| pvr_csb_pack (&front_a_sref, TA_STATE_ISPA, ispa) { |
| ispa.sref = ds_state.stencil.front.reference; |
| } |
| front_a |= front_a_sref; |
| |
| pvr_csb_pack (&back_a_sref, TA_STATE_ISPA, ispa) { |
| ispa.sref = ds_state.stencil.back.reference; |
| } |
| back_a |= back_a_sref; |
| |
| pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) { |
| const struct vk_stencil_test_face_state *const front = |
| &ds_state.stencil.front; |
| |
| if (ds_state.stencil.write_enable) |
| ispb.swmask = front->write_mask; |
| |
| ispb.scmpmask = front->compare_mask; |
| |
| ispb.sop3 = pvr_ta_stencilop(front->op.pass); |
| ispb.sop2 = pvr_ta_stencilop(front->op.depth_fail); |
| ispb.sop1 = pvr_ta_stencilop(front->op.fail); |
| ispb.scmpmode = pvr_ta_cmpmode(front->op.compare); |
| } |
| |
| pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) { |
| const struct vk_stencil_test_face_state *const back = |
| &ds_state.stencil.back; |
| |
| if (ds_state.stencil.write_enable) |
| ispb.swmask = back->write_mask; |
| |
| ispb.scmpmask = back->compare_mask; |
| |
| ispb.sop3 = pvr_ta_stencilop(back->op.pass); |
| ispb.sop2 = pvr_ta_stencilop(back->op.depth_fail); |
| ispb.sop1 = pvr_ta_stencilop(back->op.fail); |
| ispb.scmpmode = pvr_ta_cmpmode(back->op.compare); |
| } |
| } else { |
| front_b = ispb_stencil_off; |
| back_b = ispb_stencil_off; |
| } |
| |
| if (front_a != back_a || front_b != back_b) { |
| if (dynamic_state->rs.cull_mode & VK_CULL_MODE_BACK_BIT) { |
| /* Single face, using front state. */ |
| } else if (dynamic_state->rs.cull_mode & VK_CULL_MODE_FRONT_BIT) { |
| /* Single face, using back state. */ |
| |
| front_a = back_a; |
| front_b = back_b; |
| } else { |
| /* Both faces. */ |
| |
| header->pres_ispctl_ba = is_two_sided = true; |
| |
| if (dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) { |
| uint32_t tmp = front_a; |
| |
| front_a = back_a; |
| back_a = tmp; |
| |
| tmp = front_b; |
| front_b = back_b; |
| back_b = tmp; |
| } |
| |
| /* HW defaults to stencil off. */ |
| if (back_b != ispb_stencil_off) { |
| header->pres_ispctl_fb = true; |
| header->pres_ispctl_bb = true; |
| } |
| } |
| } |
| |
| if (ds_state.stencil.test_enable && front_b != ispb_stencil_off) |
| header->pres_ispctl_fb = true; |
| |
| pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) { |
| ispctl.upass = pass_info->isp_userpass; |
| |
| /* TODO: is bo ever NULL? Figure out what to do. */ |
| ispctl.tagwritedisable = rasterizer_discard || !fragment_shader_state->bo; |
| |
| ispctl.two_sided = is_two_sided; |
| ispctl.bpres = header->pres_ispctl_fb || header->pres_ispctl_bb; |
| |
| ispctl.dbenable = !rasterizer_discard && |
| dynamic_state->rs.depth_bias.enable && |
| obj_type == ROGUE_TA_OBJTYPE_TRIANGLE; |
| if (!rasterizer_discard && cmd_buffer->state.vis_test_enabled) { |
| ispctl.vistest = true; |
| ispctl.visreg = cmd_buffer->state.vis_reg; |
| } |
| |
| ispctl.scenable = !rasterizer_discard; |
| |
| ppp_state->isp.control_struct = ispctl; |
| } |
| |
| header->pres_ispctl = true; |
| |
| ppp_state->isp.control = isp_control; |
| ppp_state->isp.front_a = front_a; |
| ppp_state->isp.front_b = front_b; |
| ppp_state->isp.back_a = back_a; |
| ppp_state->isp.back_b = back_b; |
| } |
| |
| static float |
pvr_calculate_final_depth_bias_constant_factor(struct pvr_device_info *dev_info,
                                               VkFormat format,
                                               float depth_bias)
| { |
| /* Information for future modifiers of these depth bias calculations. |
| * ================================================================== |
| * Specified depth bias equations scale the specified constant factor by a |
| * value 'r' that is guaranteed to cause a resolvable difference in depth |
| * across the entire range of depth values. |
| * For floating point depth formats 'r' is calculated by taking the maximum |
| * exponent across the triangle. |
| * For UNORM formats 'r' is constant. |
| * Here 'n' is the number of mantissa bits stored in the floating point |
| * representation (23 for F32). |
| * |
| * UNORM Format -> z += dbcf * r + slope |
| * FLOAT Format -> z += dbcf * 2^(e-n) + slope |
| * |
| * HW Variations. |
| * ============== |
| * The HW either always performs the F32 depth bias equation (exponent based |
| * r), or in the case of HW that correctly supports the integer depth bias |
| * equation for UNORM depth formats, we can select between both equations |
| * using the ROGUE_CR_ISP_CTL.dbias_is_int flag - this is required to |
| * correctly perform Vulkan UNORM depth bias (constant r). |
| * |
| * if ern42307: |
| * if DBIAS_IS_INT_EN: |
| * z += dbcf + slope |
| * else: |
| * z += dbcf * 2^(e-n) + slope |
| * else: |
| * z += dbcf * 2^(e-n) + slope |
| * |
| */ |
| |
| float nudge_factor; |
| |
| if (PVR_HAS_ERN(dev_info, 42307)) { |
| switch (format) { |
| case VK_FORMAT_D16_UNORM: |
| return depth_bias / (1 << 15); |
| |
| case VK_FORMAT_D24_UNORM_S8_UINT: |
| case VK_FORMAT_X8_D24_UNORM_PACK32: |
| return depth_bias / (1 << 23); |
| |
| default: |
| return depth_bias; |
| } |
| } |
| |
   /* The reasoning behind clamping/nudging the value here is that UNORM
    * depth formats can have higher precision than our underlying D32F
    * representation for some depth ranges.
    *
    * When the HW scales the depth bias value by 2^(e-n) (the 'r' term) a
    * depth bias of 1 can result in a value smaller than one F32 ULP, which
    * will get quantized to 0 - resulting in no bias.
    *
    * Biasing small values away from zero ensures that small depth biases of
    * 1 still yield a result and overcome Z-fighting.
    */
| switch (format) { |
| case VK_FORMAT_D16_UNORM: |
| depth_bias *= 512.0f; |
| nudge_factor = 1.0f; |
| break; |
| |
| case VK_FORMAT_D24_UNORM_S8_UINT: |
| case VK_FORMAT_X8_D24_UNORM_PACK32: |
| depth_bias *= 2.0f; |
| nudge_factor = 2.0f; |
| break; |
| |
| default: |
| nudge_factor = 0.0f; |
| break; |
| } |
| |
| if (nudge_factor != 0.0f) { |
| if (depth_bias < 0.0f && depth_bias > -nudge_factor) |
| depth_bias -= nudge_factor; |
| else if (depth_bias > 0.0f && depth_bias < nudge_factor) |
| depth_bias += nudge_factor; |
| } |
| |
| return depth_bias; |
| } |
| |
| static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport, |
| const VkRect2D *const scissor, |
| VkRect2D *const rect_out) |
| { |
| /* TODO: See if we can remove this struct. */ |
| struct pvr_rect { |
| int32_t x0, y0; |
| int32_t x1, y1; |
| }; |
| |
| /* TODO: Worry about overflow? */ |
| const struct pvr_rect scissor_rect = { |
| .x0 = scissor->offset.x, |
| .y0 = scissor->offset.y, |
| .x1 = scissor->offset.x + scissor->extent.width, |
| .y1 = scissor->offset.y + scissor->extent.height |
| }; |
| struct pvr_rect viewport_rect = { 0 }; |
| |
| assert(viewport->width >= 0.0f); |
| assert(scissor_rect.x0 >= 0); |
| assert(scissor_rect.y0 >= 0); |
| |
| if (scissor->extent.width == 0 || scissor->extent.height == 0) { |
| *rect_out = (VkRect2D){ 0 }; |
| return; |
| } |
| |
| viewport_rect.x0 = (int32_t)viewport->x; |
| viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width; |
| |
| /* TODO: Is there a mathematical way of doing all this and then clamp at |
| * the end? |
| */ |
| /* We flip the y0 and y1 when height is negative. */ |
| viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height); |
| viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height); |
| |
| if (scissor_rect.x1 <= viewport_rect.x0 || |
| scissor_rect.y1 <= viewport_rect.y0 || |
| scissor_rect.x0 >= viewport_rect.x1 || |
| scissor_rect.y0 >= viewport_rect.y1) { |
| *rect_out = (VkRect2D){ 0 }; |
| return; |
| } |
| |
| /* Determine the overlapping rectangle. */ |
| viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0); |
| viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0); |
| viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1); |
| viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1); |
| |
| /* TODO: Is this conversion safe? Is this logic right? */ |
| rect_out->offset.x = (uint32_t)viewport_rect.x0; |
| rect_out->offset.y = (uint32_t)viewport_rect.y0; |
| rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0); |
| rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0); |
| } |
| |
| static inline uint32_t |
| pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info) |
| { |
| /* TODO: This should come from rogue_ppp.xml. */ |
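   /* I.e. 16 when the tile_size_16x16 feature is present, 32 otherwise. */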
| return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16)); |
| } |
| |
| static void |
| pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header; |
| struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| const struct ROGUE_TA_STATE_ISPCTL *const ispctl = |
| &ppp_state->isp.control_struct; |
| struct pvr_device_info *const dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| |
| if (ispctl->dbenable && |
| (BITSET_TEST(dynamic_state->dirty, |
| MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) || |
| cmd_buffer->depth_bias_array.size == 0)) { |
| struct pvr_depth_bias_state depth_bias = { |
         .constant_factor = pvr_calculate_final_depth_bias_constant_factor(
| dev_info, |
| cmd_buffer->state.depth_format, |
| dynamic_state->rs.depth_bias.constant_factor), |
| .slope_factor = dynamic_state->rs.depth_bias.slope_factor, |
| .clamp = dynamic_state->rs.depth_bias.clamp, |
| }; |
| |
| ppp_state->depthbias_scissor_indices.depthbias_index = |
| util_dynarray_num_elements(&cmd_buffer->depth_bias_array, |
| __typeof__(depth_bias)); |
| |
| util_dynarray_append(&cmd_buffer->depth_bias_array, |
| __typeof__(depth_bias), |
| depth_bias); |
| |
| header->pres_ispctl_dbsc = true; |
| } |
| |
| if (ispctl->scenable) { |
| const uint32_t region_clip_align_size = |
| pvr_get_geom_region_clip_align_size(dev_info); |
| const VkViewport *const viewport = &dynamic_state->vp.viewports[0]; |
| const VkRect2D *const scissor = &dynamic_state->vp.scissors[0]; |
| struct pvr_scissor_words scissor_words; |
| VkRect2D overlap_rect; |
| uint32_t height; |
| uint32_t width; |
| uint32_t x; |
| uint32_t y; |
| |
| /* For region clip. */ |
| uint32_t bottom; |
| uint32_t right; |
| uint32_t left; |
| uint32_t top; |
| |
| /* We don't support multiple viewport calculations. */ |
| assert(dynamic_state->vp.viewport_count == 1); |
| /* We don't support multiple scissor calculations. */ |
| assert(dynamic_state->vp.scissor_count == 1); |
| |
| pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect); |
| |
| x = overlap_rect.offset.x; |
| y = overlap_rect.offset.y; |
| width = overlap_rect.extent.width; |
| height = overlap_rect.extent.height; |
| |
| pvr_csb_pack (&scissor_words.w0, IPF_SCISSOR_WORD_0, word0) { |
| word0.scw0_xmax = x + width; |
| word0.scw0_xmin = x; |
| } |
| |
| pvr_csb_pack (&scissor_words.w1, IPF_SCISSOR_WORD_1, word1) { |
| word1.scw1_ymax = y + height; |
| word1.scw1_ymin = y; |
| } |
| |
| if (cmd_buffer->scissor_array.size && |
| cmd_buffer->scissor_words.w0 == scissor_words.w0 && |
| cmd_buffer->scissor_words.w1 == scissor_words.w1) { |
| return; |
| } |
| |
| cmd_buffer->scissor_words = scissor_words; |
| |
| /* Calculate region clip. */ |
| |
| left = x / region_clip_align_size; |
| top = y / region_clip_align_size; |
| |
      /* Guard against right underflowing to -1 when (x + width) is 0. */
| /* TODO: Is there a better way of doing this? */ |
| if ((x + width) != 0U) |
| right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1; |
| else |
| right = 0; |
| |
| if ((y + height) != 0U) |
| bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1; |
| else |
| bottom = 0U; |
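      /* E.g. with a region clip alignment of 32 (no tile_size_16x16 feature),
       * x + width = 1920 gives right = DIV_ROUND_UP(1920, 32) - 1 = 59.
       */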
| |
      /* Set up region clip to clip everything outside what was calculated. */
| |
| /* FIXME: Should we mask to prevent writing over other words? */ |
| pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) { |
| word0.right = right; |
| word0.left = left; |
| word0.mode = ROGUE_TA_REGION_CLIP_MODE_OUTSIDE; |
| } |
| |
| pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) { |
| word1.bottom = bottom; |
| word1.top = top; |
| } |
| |
| ppp_state->depthbias_scissor_indices.scissor_index = |
| util_dynarray_num_elements(&cmd_buffer->scissor_array, |
| struct pvr_scissor_words); |
| |
| util_dynarray_append(&cmd_buffer->scissor_array, |
| struct pvr_scissor_words, |
| cmd_buffer->scissor_words); |
| |
| header->pres_ispctl_dbsc = true; |
| header->pres_region_clip = true; |
| } |
| } |
| |
| static void |
| pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer, |
| struct ROGUE_TA_STATE_ISPA *ispa) |
| { |
| struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header; |
| struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; |
| uint32_t merge_word; |
| uint32_t mask; |
| |
| pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) { |
      /* Disable triangle merging for lines, for punch-through passes, or when
       * depth writes are disabled with a depth compare mode of 'always'.
       */
| if (ispa->objtype == ROGUE_TA_OBJTYPE_LINE || |
| ispa->passtype == ROGUE_TA_PASSTYPE_PUNCH_THROUGH || |
| (ispa->dwritedisable && ispa->dcmpmode == ROGUE_TA_CMPMODE_ALWAYS)) { |
| size_info.pds_tri_merge_disable = true; |
| } |
| } |
| |
| pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) { |
| size_info.pds_tri_merge_disable = true; |
| } |
| |
| merge_word |= ppp_state->pds.size_info2 & ~mask; |
| |
| if (merge_word != ppp_state->pds.size_info2) { |
| ppp_state->pds.size_info2 = merge_word; |
| header->pres_pds_state_ptr0 = true; |
| } |
| } |
| |
| static void |
| pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const pco_data *const fs_data = &state->gfx_pipeline->fs_data; |
| |
| const struct pvr_fragment_shader_state *const fragment_shader_state = |
| &state->gfx_pipeline->shader_state.fragment; |
| const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state = |
| &fragment_shader_state->descriptor_state; |
| const struct pvr_pipeline_stage_state *fragment_state = |
| &fragment_shader_state->stage_state; |
| const struct pvr_pds_upload *pds_coeff_program = |
| &fragment_shader_state->pds_coeff_program; |
| |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header; |
| struct pvr_ppp_state *const ppp_state = &state->ppp_state; |
| |
| const uint32_t pds_uniform_size = |
| DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords, |
| ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE); |
| |
| const uint32_t pds_varying_state_size = |
| DIV_ROUND_UP(pds_coeff_program->data_size, |
| ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE); |
| |
| const uint32_t usc_varying_size = |
| DIV_ROUND_UP(fs_data->common.coeffs, |
| ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE); |
| |
| const uint32_t pds_temp_size = |
| DIV_ROUND_UP(fragment_state->pds_temps_count, |
| ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE); |
| |
| const uint32_t usc_shared_size = |
| DIV_ROUND_UP(fs_data->common.shareds, |
| ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE); |
| |
| const uint32_t max_tiles_in_flight = |
| pvr_calc_fscommon_size_and_tiles_in_flight( |
| &pdevice->dev_info, |
| &pdevice->dev_runtime_info, |
| usc_shared_size * |
| ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE, |
| 1); |
| uint32_t size_info_mask; |
| uint32_t size_info2; |
| |
| if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight) |
| sub_cmd->max_tiles_in_flight = max_tiles_in_flight; |
| |
| pvr_csb_pack (&ppp_state->pds.pixel_shader_base, |
| TA_STATE_PDS_SHADERBASE, |
| shader_base) { |
| const struct pvr_pds_upload *const pds_upload = |
| &fragment_shader_state->pds_fragment_program; |
| |
| shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset); |
| } |
| |
| if (descriptor_shader_state->pds_code.pvr_bo) { |
| pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base, |
| TA_STATE_PDS_TEXUNICODEBASE, |
| tex_base) { |
| tex_base.addr = |
| PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset); |
| } |
| } else { |
| ppp_state->pds.texture_uniform_code_base = 0U; |
| } |
| |
| pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) { |
| info1.pds_uniformsize = pds_uniform_size; |
| info1.pds_texturestatesize = 0U; |
| info1.pds_varyingsize = pds_varying_state_size; |
| info1.usc_varyingsize = usc_varying_size; |
| info1.pds_tempsize = pds_temp_size; |
| } |
| |
| pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) { |
| mask.pds_tri_merge_disable = true; |
| } |
| |
| ppp_state->pds.size_info2 &= size_info_mask; |
| |
| pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) { |
| info2.usc_sharedsize = usc_shared_size; |
| } |
| |
| ppp_state->pds.size_info2 |= size_info2; |
| |
| if (pds_coeff_program->pvr_bo) { |
| header->pres_pds_state_ptr1 = true; |
| |
| pvr_csb_pack (&ppp_state->pds.varying_base, |
| TA_STATE_PDS_VARYINGBASE, |
| base) { |
| base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset); |
| } |
| } else { |
| ppp_state->pds.varying_base = 0U; |
| } |
| |
| pvr_csb_pack (&ppp_state->pds.uniform_state_data_base, |
| TA_STATE_PDS_UNIFORMDATABASE, |
| base) { |
| base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset); |
| } |
| |
| header->pres_pds_state_ptr0 = true; |
| header->pres_pds_state_ptr3 = true; |
| } |
| |
| static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header; |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| struct pvr_ppp_state *const ppp_state = &state->ppp_state; |
| |
| if (ppp_state->viewport_count != dynamic_state->vp.viewport_count) { |
| ppp_state->viewport_count = dynamic_state->vp.viewport_count; |
| header->pres_viewport = true; |
| } |
| |
| if (dynamic_state->rs.rasterizer_discard_enable) { |
| /* We don't want to emit any viewport data as it'll just get thrown |
| * away. It's after the previous condition because we still want to |
| * stash the viewport_count as it's our trigger for when |
| * rasterizer discard gets disabled. |
| */ |
| header->pres_viewport = false; |
| return; |
| } |
| |
| for (uint32_t i = 0; i < ppp_state->viewport_count; i++) { |
| VkViewport *viewport = &dynamic_state->vp.viewports[i]; |
| uint32_t x_scale = fui(viewport->width * 0.5f); |
| uint32_t y_scale = fui(viewport->height * 0.5f); |
| uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth); |
| uint32_t x_center = fui(viewport->x + viewport->width * 0.5f); |
| uint32_t y_center = fui(viewport->y + viewport->height * 0.5f); |
| uint32_t z_center = fui(viewport->minDepth); |
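      /* The 'a' and 'm' values appear to be the per-axis offset and scale
       * terms of the viewport transform, i.e. x_fb = m0 * x_ndc + a0 (and
       * similarly for y and z), matching the standard Vulkan viewport
       * equations.
       */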
| |
| if (ppp_state->viewports[i].a0 != x_center || |
| ppp_state->viewports[i].m0 != x_scale || |
| ppp_state->viewports[i].a1 != y_center || |
| ppp_state->viewports[i].m1 != y_scale || |
| ppp_state->viewports[i].a2 != z_center || |
| ppp_state->viewports[i].m2 != z_scale) { |
| ppp_state->viewports[i].a0 = x_center; |
| ppp_state->viewports[i].m0 = x_scale; |
| ppp_state->viewports[i].a1 = y_center; |
| ppp_state->viewports[i].m1 = y_scale; |
| ppp_state->viewports[i].a2 = z_center; |
| ppp_state->viewports[i].m2 = z_scale; |
| |
| header->pres_viewport = true; |
| } |
| } |
| } |
| |
| static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology; |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header; |
| struct pvr_ppp_state *const ppp_state = &state->ppp_state; |
| uint32_t ppp_control; |
| |
| pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) { |
| control.drawclippededges = true; |
| control.wclampen = true; |
| |
| if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN) |
| control.flatshade_vtx = ROGUE_TA_FLATSHADE_VTX_VERTEX_1; |
| else |
| control.flatshade_vtx = ROGUE_TA_FLATSHADE_VTX_VERTEX_0; |
| |
| if (dynamic_state->rs.depth_clamp_enable) |
| control.clip_mode = ROGUE_TA_CLIP_MODE_NO_FRONT_OR_REAR; |
| else |
| control.clip_mode = ROGUE_TA_CLIP_MODE_FRONT_REAR; |
| |
| /* +--- FrontIsCCW? |
| * | +--- Cull Front? |
| * v v |
| * 0|0 CULLMODE_CULL_CCW, |
| * 0|1 CULLMODE_CULL_CW, |
| * 1|0 CULLMODE_CULL_CW, |
| * 1|1 CULLMODE_CULL_CCW, |
| */ |
| switch (dynamic_state->rs.cull_mode) { |
| case VK_CULL_MODE_BACK_BIT: |
| case VK_CULL_MODE_FRONT_BIT: |
| if ((dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^ |
| (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_BIT)) { |
| control.cullmode = ROGUE_TA_CULLMODE_CULL_CW; |
| } else { |
| control.cullmode = ROGUE_TA_CULLMODE_CULL_CCW; |
| } |
| |
| break; |
| |
| case VK_CULL_MODE_FRONT_AND_BACK: |
| case VK_CULL_MODE_NONE: |
| control.cullmode = ROGUE_TA_CULLMODE_NO_CULLING; |
| break; |
| |
| default: |
| unreachable("Unsupported cull mode!"); |
| } |
| } |
| |
| if (ppp_control != ppp_state->ppp_control) { |
| ppp_state->ppp_control = ppp_control; |
| header->pres_ppp_ctrl = true; |
| } |
| } |
| |
| /* Largest valid PPP State update in words = 31 |
| * 1 - Header |
| * 3 - Stream Out Config words 0, 1 and 2 |
| * 1 - PPP Control word |
| * 3 - Varying Config words 0, 1 and 2 |
| * 1 - Output Select |
| * 1 - WClamp |
| * 6 - Viewport Transform words |
| * 2 - Region Clip words |
| * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3) |
| * 4 - PDS State for fragment phase (PDSSTATEPTR0) |
| * 6 - ISP Control Words |
| */ |
| #define PVR_MAX_PPP_STATE_DWORDS 31 |
| |
| static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| const bool deferred_secondary = pvr_cmd_uses_deferred_cs_cmds(cmd_buffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header; |
| struct pvr_csb *const control_stream = &sub_cmd->control_stream; |
| struct pvr_ppp_state *const ppp_state = &state->ppp_state; |
| uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS]; |
| const bool emit_dbsc = header->pres_ispctl_dbsc; |
| uint32_t *buffer_ptr = ppp_state_words; |
| uint32_t dbsc_patching_offset = 0; |
| uint32_t ppp_state_words_count; |
| struct pvr_suballoc_bo *pvr_bo; |
| VkResult result; |
| |
| #if !defined(NDEBUG) |
| struct ROGUE_TA_STATE_HEADER emit_mask = *header; |
| uint32_t packed_emit_mask; |
| |
| static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1, |
| "EMIT_MASK_IS_CLEAR assumes 1 dword sized header."); |
| |
| # define EMIT_MASK_GET(field) (emit_mask.field) |
| # define EMIT_MASK_SET(field, value) (emit_mask.field = (value)) |
| # define EMIT_MASK_IS_CLEAR \ |
| (pvr_cmd_pack(TA_STATE_HEADER)(&packed_emit_mask, &emit_mask), \ |
| packed_emit_mask == 0) |
| #else |
| # define EMIT_MASK_GET(field) |
| # define EMIT_MASK_SET(field, value) |
| #endif |
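   /* In debug builds emit_mask starts as a copy of the header and each field
    * is cleared as its corresponding word is written out; the
    * EMIT_MASK_IS_CLEAR assert at the end catches any header bit that was set
    * without the matching state word being emitted.
    */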
| |
| header->view_port_count = |
| (ppp_state->viewport_count == 0) ? 0U : (ppp_state->viewport_count - 1); |
| header->pres_ispctl_fa = header->pres_ispctl; |
| |
| /* If deferred_secondary is true then we do a separate state update |
| * which gets patched in vkCmdExecuteCommands(). |
| */ |
| header->pres_ispctl_dbsc &= !deferred_secondary; |
| |
| pvr_csb_write_struct(buffer_ptr, TA_STATE_HEADER, header); |
| |
| static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1, |
| "Following header check assumes 1 dword sized header."); |
| /* If the header is empty we exit early and prevent a bo alloc of 0 size. */ |
| if (ppp_state_words[0] == 0) |
| return VK_SUCCESS; |
| |
| if (header->pres_ispctl) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPCTL, ppp_state->isp.control); |
| |
| assert(header->pres_ispctl_fa); |
| /* This is not a mistake. FA, BA have the ISPA format, and FB, BB have the |
| * ISPB format. |
| */ |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.front_a); |
| EMIT_MASK_SET(pres_ispctl_fa, false); |
| |
| if (header->pres_ispctl_fb) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.front_b); |
| EMIT_MASK_SET(pres_ispctl_fb, false); |
| } |
| |
| if (header->pres_ispctl_ba) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.back_a); |
| EMIT_MASK_SET(pres_ispctl_ba, false); |
| } |
| |
| if (header->pres_ispctl_bb) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.back_b); |
| EMIT_MASK_SET(pres_ispctl_bb, false); |
| } |
| |
| EMIT_MASK_SET(pres_ispctl, false); |
| } |
| |
| if (header->pres_ispctl_dbsc) { |
| assert(!deferred_secondary); |
| |
| dbsc_patching_offset = buffer_ptr - ppp_state_words; |
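      /* Record where the ISPDBSC word sits within the PPP block so the
       * deferred DBSC2 command below can patch the real depth bias/scissor
       * indices into the uploaded block later.
       */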
| |
| pvr_csb_pack (buffer_ptr, TA_STATE_ISPDBSC, ispdbsc) { |
| ispdbsc.dbindex = ppp_state->depthbias_scissor_indices.depthbias_index; |
| ispdbsc.scindex = ppp_state->depthbias_scissor_indices.scissor_index; |
| } |
| buffer_ptr += pvr_cmd_length(TA_STATE_ISPDBSC); |
| |
| EMIT_MASK_SET(pres_ispctl_dbsc, false); |
| } |
| |
| if (header->pres_pds_state_ptr0) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_SHADERBASE, |
| ppp_state->pds.pixel_shader_base); |
| |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_TEXUNICODEBASE, |
| ppp_state->pds.texture_uniform_code_base); |
| |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_SIZEINFO1, |
| ppp_state->pds.size_info1); |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_SIZEINFO2, |
| ppp_state->pds.size_info2); |
| |
| EMIT_MASK_SET(pres_pds_state_ptr0, false); |
| } |
| |
| if (header->pres_pds_state_ptr1) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_VARYINGBASE, |
| ppp_state->pds.varying_base); |
| EMIT_MASK_SET(pres_pds_state_ptr1, false); |
| } |
| |
   /* We don't use the pds_state_ptr2 (texture state programs) control word,
    * and we don't need to zero it either: the hardware only runs the texture
    * state program when ROGUE_TA_STATE_PDS_SIZEINFO1.pds_texturestatesize is
    * non-zero.
    */
| assert(pvr_csb_unpack(&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1) |
| .pds_texturestatesize == 0); |
| |
| if (header->pres_pds_state_ptr3) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_UNIFORMDATABASE, |
| ppp_state->pds.uniform_state_data_base); |
| EMIT_MASK_SET(pres_pds_state_ptr3, false); |
| } |
| |
| if (header->pres_region_clip) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_REGION_CLIP0, |
| ppp_state->region_clipping.word0); |
| pvr_csb_write_value(buffer_ptr, |
| TA_REGION_CLIP1, |
| ppp_state->region_clipping.word1); |
| |
| EMIT_MASK_SET(pres_region_clip, false); |
| } |
| |
| if (header->pres_viewport) { |
| const uint32_t viewports = MAX2(1, ppp_state->viewport_count); |
| EMIT_MASK_SET(view_port_count, viewports); |
| |
| for (uint32_t i = 0; i < viewports; i++) { |
| /* These don't have any definitions in the csbgen xml files and none |
| * will be added. |
| */ |
| *buffer_ptr++ = ppp_state->viewports[i].a0; |
| *buffer_ptr++ = ppp_state->viewports[i].m0; |
| *buffer_ptr++ = ppp_state->viewports[i].a1; |
| *buffer_ptr++ = ppp_state->viewports[i].m1; |
| *buffer_ptr++ = ppp_state->viewports[i].a2; |
| *buffer_ptr++ = ppp_state->viewports[i].m2; |
| |
| EMIT_MASK_SET(view_port_count, EMIT_MASK_GET(view_port_count) - 1); |
| } |
| |
| EMIT_MASK_SET(pres_viewport, false); |
| } |
| |
| if (header->pres_wclamp) { |
| pvr_csb_pack (buffer_ptr, TA_WCLAMP, wclamp) { |
| wclamp.val = fui(0.00001f); |
| } |
| buffer_ptr += pvr_cmd_length(TA_WCLAMP); |
| EMIT_MASK_SET(pres_wclamp, false); |
| } |
| |
| if (header->pres_outselects) { |
| pvr_csb_write_value(buffer_ptr, TA_OUTPUT_SEL, ppp_state->output_selects); |
| EMIT_MASK_SET(pres_outselects, false); |
| } |
| |
| if (header->pres_varying_word0) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_VARYING0, |
| ppp_state->varying_word[0]); |
| EMIT_MASK_SET(pres_varying_word0, false); |
| } |
| |
| if (header->pres_varying_word1) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_VARYING1, |
| ppp_state->varying_word[1]); |
| EMIT_MASK_SET(pres_varying_word1, false); |
| } |
| |
| /* We only emit this on the first draw of a render job to prevent us from |
| * inheriting a non-zero value set elsewhere. |
| */ |
| if (header->pres_varying_word2) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_VARYING2, 0); |
| EMIT_MASK_SET(pres_varying_word2, false); |
| } |
| |
| if (header->pres_ppp_ctrl) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PPP_CTRL, |
| ppp_state->ppp_control); |
| EMIT_MASK_SET(pres_ppp_ctrl, false); |
| } |
| |
| /* We only emit this on the first draw of a render job to prevent us from |
| * inheriting a non-zero value set elsewhere. |
| */ |
| if (header->pres_stream_out_size) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_STREAM_OUT0, 0); |
| EMIT_MASK_SET(pres_stream_out_size, false); |
| } |
| |
| assert(EMIT_MASK_IS_CLEAR); |
| |
| #undef EMIT_MASK_GET |
| #undef EMIT_MASK_SET |
| #if !defined(NDEBUG) |
| # undef EMIT_MASK_IS_CLEAR |
| #endif |
| |
| ppp_state_words_count = buffer_ptr - ppp_state_words; |
| assert(ppp_state_words_count <= PVR_MAX_PPP_STATE_DWORDS); |
| |
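| /* Upload the packed PPP state words; the VDM control stream entries below |
| * reference them by device address rather than embedding them inline. |
| */ |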
| result = pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.general_heap, |
| PVR_DW_TO_BYTES(ppp_state_words_count), |
| &pvr_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| memcpy(pvr_bo_suballoc_get_map_addr(pvr_bo), |
| ppp_state_words, |
| PVR_DW_TO_BYTES(ppp_state_words_count)); |
| |
| pvr_csb_set_relocation_mark(control_stream); |
| |
| /* Write the VDM state update into the VDM control stream. */ |
| pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) { |
| state0.word_count = ppp_state_words_count; |
| state0.addrmsb = pvr_bo->dev_addr; |
| } |
| |
| pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) { |
| state1.addrlsb = pvr_bo->dev_addr; |
| } |
| |
| pvr_csb_clear_relocation_mark(control_stream); |
| |
| if (emit_dbsc && cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { |
| struct pvr_deferred_cs_command cmd; |
| |
| if (deferred_secondary) { |
| const uint32_t num_dwords = pvr_cmd_length(VDMCTRL_PPP_STATE0) + |
| pvr_cmd_length(VDMCTRL_PPP_STATE1); |
| uint32_t *vdm_state; |
| |
| pvr_csb_set_relocation_mark(control_stream); |
| |
| vdm_state = pvr_csb_alloc_dwords(control_stream, num_dwords); |
| if (!vdm_state) { |
| result = pvr_csb_get_status(control_stream); |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| } |
| |
| pvr_csb_clear_relocation_mark(control_stream); |
| |
| cmd = (struct pvr_deferred_cs_command){ |
| .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC, |
| .dbsc = { |
| .state = ppp_state->depthbias_scissor_indices, |
| .vdm_state = vdm_state, |
| }, |
| }; |
| } else { |
| cmd = (struct pvr_deferred_cs_command){ |
| .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2, |
| .dbsc2 = { |
| .state = ppp_state->depthbias_scissor_indices, |
| .ppp_cs_bo = pvr_bo, |
| .patch_offset = dbsc_patching_offset, |
| }, |
| }; |
| } |
| |
| util_dynarray_append(&cmd_buffer->deferred_csb_commands, |
| struct pvr_deferred_cs_command, |
| cmd); |
| } |
| |
| state->emit_header = (struct ROGUE_TA_STATE_HEADER){ 0 }; |
| |
| return VK_SUCCESS; |
| } |
| |
| static inline bool |
| pvr_ppp_state_update_required(const struct pvr_cmd_buffer *cmd_buffer) |
| { |
| const BITSET_WORD *const dynamic_dirty = |
| cmd_buffer->vk.dynamic_graphics_state.dirty; |
| const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header; |
| |
| /* For push constants we only need to check for updates to the fragment |
| * stage, since the only PDS programs being updated here are those used in |
| * the fragment stage. |
| */ |
| |
| return header->pres_ppp_ctrl || header->pres_ispctl || |
| header->pres_ispctl_fb || header->pres_ispctl_ba || |
| header->pres_ispctl_bb || header->pres_ispctl_dbsc || |
| header->pres_pds_state_ptr0 || header->pres_pds_state_ptr1 || |
| header->pres_pds_state_ptr2 || header->pres_pds_state_ptr3 || |
| header->pres_region_clip || header->pres_viewport || |
| header->pres_wclamp || header->pres_outselects || |
| header->pres_varying_word0 || header->pres_varying_word1 || |
| header->pres_varying_word2 || header->pres_stream_out_program || |
| state->dirty.fragment_descriptors || state->dirty.vis_test || |
| state->dirty.gfx_pipeline_binding || state->dirty.isp_userpass || |
| state->push_constants.dirty_stages & VK_SHADER_STAGE_FRAGMENT_BIT || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSORS) || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) || |
| BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT); |
| } |
| |
| static VkResult |
| pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| VkResult result; |
| |
| /* TODO: The emit_header will be dirty only if |
| * pvr_reset_graphics_dirty_state() was called before this (so when command |
| * buffer begins recording or when it's reset). Otherwise it will have been |
| * zeroed out by the previous pvr_emit_ppp_state(). We can probably set a |
| * flag in there and check it here instead of checking the header. |
| * Check if this is true and implement the flag. |
| */ |
| if (!pvr_ppp_state_update_required(cmd_buffer)) |
| return VK_SUCCESS; |
| |
| if (state->dirty.gfx_pipeline_binding) { |
| struct ROGUE_TA_STATE_ISPA ispa; |
| |
| pvr_setup_output_select(cmd_buffer); |
| pvr_setup_isp_faces_and_control(cmd_buffer, &ispa); |
| pvr_setup_triangle_merging_flag(cmd_buffer, &ispa); |
| } else if (BITSET_TEST(dynamic_state->dirty, |
| MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || |
| BITSET_TEST(dynamic_state->dirty, |
| MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || |
| BITSET_TEST(dynamic_state->dirty, |
| MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || |
| BITSET_TEST(dynamic_state->dirty, |
| MESA_VK_DYNAMIC_RS_LINE_WIDTH) || |
| state->dirty.isp_userpass || state->dirty.vis_test) { |
| pvr_setup_isp_faces_and_control(cmd_buffer, NULL); |
| } |
| |
| if (!dynamic_state->rs.rasterizer_discard_enable && |
| state->dirty.fragment_descriptors && |
| state->gfx_pipeline->shader_state.fragment.bo && |
| !state->gfx_pipeline->fs_data.common.uses.empty) { |
| pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd); |
| } |
| |
| pvr_setup_isp_depth_bias_scissor_state(cmd_buffer); |
| |
| if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) || |
| BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT)) |
| pvr_setup_viewport(cmd_buffer); |
| |
| pvr_setup_ppp_control(cmd_buffer); |
| |
| /* The hardware doesn't have an explicit mode for culling both front and |
| * back-facing primitives, so we use a negative viewport to make sure all |
| * objects are culled out early. |
| */ |
| if (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) { |
| /* Shift the viewport out of the guard-band, culling everything. */ |
| const uint32_t negative_vp_val = fui(-2.0f); |
| |
| state->ppp_state.viewports[0].a0 = negative_vp_val; |
| state->ppp_state.viewports[0].m0 = 0; |
| state->ppp_state.viewports[0].a1 = negative_vp_val; |
| state->ppp_state.viewports[0].m1 = 0; |
| state->ppp_state.viewports[0].a2 = negative_vp_val; |
| state->ppp_state.viewports[0].m2 = 0; |
| |
| state->ppp_state.viewport_count = 1; |
| |
| state->emit_header.pres_viewport = true; |
| } |
| |
| result = pvr_emit_ppp_state(cmd_buffer, sub_cmd); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| return VK_SUCCESS; |
| } |
| |
| void pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info, |
| const uint32_t vs_output_size, |
| const bool raster_enable, |
| uint32_t *const cam_size_out, |
| uint32_t *const vs_max_instances_out) |
| { |
| /* First work out the size of a vertex in the UVS and multiply by 4 for |
| * column ordering. |
| */ |
| const uint32_t uvs_vertex_vector_size_in_dwords = |
| (vs_output_size + 1U + raster_enable * 4U) * 4U; |
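| /* For example, with vs_output_size = 8 and rasterization enabled this is |
| * (8 + 1 + 4) * 4 = 52 dwords per vertex vector. |
| */ |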
| const uint32_t vdm_cam_size = |
| PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U); |
| |
| /* This is a proxy for 8XE. */ |
| if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) && |
| vdm_cam_size < 96U) { |
| /* Comparisons are based on size including scratch per vertex vector. */ |
| if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) { |
| *cam_size_out = MIN2(31U, vdm_cam_size - 1U); |
| *vs_max_instances_out = 16U; |
| } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) { |
| *cam_size_out = 15U; |
| *vs_max_instances_out = 16U; |
| } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) { |
| *cam_size_out = 11U; |
| *vs_max_instances_out = 12U; |
| } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) { |
| *cam_size_out = 7U; |
| *vs_max_instances_out = 8U; |
| } else if (PVR_HAS_FEATURE(dev_info, |
| simple_internal_parameter_format_v2) || |
| uvs_vertex_vector_size_in_dwords < (64U * 4U)) { |
| *cam_size_out = 7U; |
| *vs_max_instances_out = 4U; |
| } else { |
| *cam_size_out = 3U; |
| *vs_max_instances_out = 2U; |
| } |
| } else { |
| /* Comparisons are based on size including scratch per vertex vector. */ |
| if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) { |
| /* output size <= 27 + 5 scratch. */ |
| *cam_size_out = MIN2(95U, vdm_cam_size - 1U); |
| *vs_max_instances_out = 0U; |
| } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) { |
| /* output size <= 43 + 5 scratch */ |
| *cam_size_out = 63U; |
| if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U) |
| *vs_max_instances_out = 16U; |
| else |
| *vs_max_instances_out = 0U; |
| } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) { |
| /* output size <= 59 + 5 scratch. */ |
| *cam_size_out = 31U; |
| if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U) |
| *vs_max_instances_out = 16U; |
| else |
| *vs_max_instances_out = 0U; |
| } else { |
| *cam_size_out = 15U; |
| *vs_max_instances_out = 16U; |
| } |
| } |
| } |
| |
| static void pvr_emit_dirty_vdm_state(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| /* FIXME: Assume all state is dirty for the moment. */ |
| struct pvr_device_info *const dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| ASSERTED const uint32_t max_user_vertex_output_components = |
| pvr_get_max_user_vertex_output_components(dev_info); |
| struct ROGUE_VDMCTRL_VDM_STATE0 header = { pvr_cmd_header( |
| VDMCTRL_VDM_STATE0) }; |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const pco_data *const vs_data = &state->gfx_pipeline->vs_data; |
| struct pvr_csb *const csb = &sub_cmd->control_stream; |
| uint32_t max_instances; |
| uint32_t cam_size; |
| |
| /* CAM Calculations and HW state take vertex size aligned to DWORDS. */ |
| assert(vs_data->vs.vtxouts <= max_user_vertex_output_components); |
| |
| pvr_calculate_vertex_cam_size(dev_info, |
| vs_data->vs.vtxouts, |
| true, |
| &cam_size, |
| &max_instances); |
| |
| pvr_csb_set_relocation_mark(csb); |
| |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) { |
| state0.cam_size = cam_size; |
| |
| if (dynamic_state->ia.primitive_restart_enable) { |
| state0.cut_index_enable = true; |
| state0.cut_index_present = true; |
| } |
| |
| switch (dynamic_state->ia.primitive_topology) { |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: |
| state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_1; |
| break; |
| |
| default: |
| state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_0; |
| break; |
| } |
| |
| /* If we've bound a different vertex buffer, or this draw call requires a |
| * different PDS attrib data section from the last draw call (changed |
| * base_instance), then we need to specify a new data section. This is |
| * also the case if we've switched pipeline or attrib program, as the |
| * data-section layout will be different. |
| */ |
| state0.vs_data_addr_present = |
| state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings || |
| state->dirty.draw_base_instance || state->dirty.draw_variant; |
| |
| /* Need to specify new PDS Attrib program if we've bound a different |
| * pipeline or we needed a different PDS Attrib variant for this |
| * draw-call. |
| */ |
| state0.vs_other_present = state->dirty.gfx_pipeline_binding || |
| state->dirty.draw_variant; |
| |
| /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when |
| * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because |
| * Vulkan doesn't support stream output and the vertex position is |
| * always emitted to the UVB. |
| */ |
| state0.uvs_scratch_size_select = |
| ROGUE_VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE; |
| |
| header = state0; |
| } |
| |
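| /* Emit the optional VDM state words whose presence bits were set in |
| * STATE0 above. |
| */ |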
| if (header.cut_index_present) { |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) { |
| state1.cut_index = |
| vk_index_to_restart(state->index_buffer_binding.type); |
| } |
| } |
| |
| if (header.vs_data_addr_present) { |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) { |
| state2.vs_pds_data_base_addr = |
| PVR_DEV_ADDR(state->pds_vertex_attrib_offset); |
| } |
| } |
| |
| if (header.vs_other_present) { |
| const uint32_t usc_unified_store_size_in_bytes = vs_data->common.vtxins |
| << 2; |
| |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) { |
| state3.vs_pds_code_base_addr = |
| PVR_DEV_ADDR(state->pds_shader.code_offset); |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) { |
| state4.vs_output_size = vs_data->vs.vtxouts; |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) { |
| state5.vs_max_instances = max_instances; |
| state5.vs_usc_common_size = 0U; |
| state5.vs_usc_unified_size = DIV_ROUND_UP( |
| usc_unified_store_size_in_bytes, |
| ROGUE_VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE); |
| state5.vs_pds_temp_size = |
| DIV_ROUND_UP(state->pds_shader.info->temps_required << 2, |
| ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE); |
| state5.vs_pds_data_size = DIV_ROUND_UP( |
| PVR_DW_TO_BYTES(state->pds_shader.info->data_size_in_dwords), |
| ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE); |
| } |
| } |
| |
| pvr_csb_clear_relocation_mark(csb); |
| } |
| |
| static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline; |
| const pco_data *const fs_data = &gfx_pipeline->fs_data; |
| struct pvr_sub_cmd_gfx *sub_cmd; |
| bool fstencil_writemask_zero; |
| bool bstencil_writemask_zero; |
| bool fstencil_keep; |
| bool bstencil_keep; |
| VkResult result; |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| sub_cmd = &state->current_sub_cmd->gfx; |
| sub_cmd->empty_cmd = false; |
| |
| /* Determine pipeline depth/stencil usage. If a pipeline uses depth or |
| * stencil testing, those attachments are using their loaded values, and |
| * the loadOps cannot be optimized out. |
| */ |
| /* Pipeline uses depth testing. */ |
| if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED && |
| dynamic_state->ds.depth.compare_op != VK_COMPARE_OP_ALWAYS) { |
| sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED; |
| } |
| |
| /* Pipeline uses stencil testing. */ |
| if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED && |
| (dynamic_state->ds.stencil.front.op.compare != VK_COMPARE_OP_ALWAYS || |
| dynamic_state->ds.stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) { |
| sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED; |
| } |
| |
| if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info, |
| compute_overlap)) { |
| uint32_t coefficient_size = |
| DIV_ROUND_UP(fs_data->common.coeffs, |
| ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE); |
| |
| if (coefficient_size > |
| ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE) |
| sub_cmd->disable_compute_overlap = true; |
| } |
| |
| sub_cmd->frag_uses_atomic_ops |= fs_data->common.uses.atomics; |
| sub_cmd->frag_has_side_effects |= fs_data->common.uses.side_effects; |
| sub_cmd->frag_uses_texture_rw |= false; |
| sub_cmd->vertex_uses_texture_rw |= false; |
| |
| sub_cmd->job.get_vis_results = state->vis_test_enabled; |
| |
| fstencil_keep = |
| (dynamic_state->ds.stencil.front.op.fail == VK_STENCIL_OP_KEEP) && |
| (dynamic_state->ds.stencil.front.op.pass == VK_STENCIL_OP_KEEP); |
| bstencil_keep = |
| (dynamic_state->ds.stencil.back.op.fail == VK_STENCIL_OP_KEEP) && |
| (dynamic_state->ds.stencil.back.op.pass == VK_STENCIL_OP_KEEP); |
| fstencil_writemask_zero = (dynamic_state->ds.stencil.front.write_mask == 0); |
| bstencil_writemask_zero = (dynamic_state->ds.stencil.back.write_mask == 0); |
| |
| /* Set the stencil modified flag if: |
| * - At least one of the front or back-facing stencil states has a |
| * fail_op/pass_op other than KEEP, and |
| * - At least one of the front or back-facing stencil states has a |
| * non-zero write_mask. |
| */ |
| if (!(fstencil_keep && bstencil_keep) && |
| !(fstencil_writemask_zero && bstencil_writemask_zero)) { |
| sub_cmd->modifies_stencil = true; |
| } |
| |
| /* Set depth modified flag if depth write is enabled. */ |
| if (dynamic_state->ds.depth.write_enable) |
| sub_cmd->modifies_depth = true; |
| |
| /* If either the data or code changes for pds vertex attribs, regenerate the |
| * data segment. |
| */ |
| if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding || |
| state->dirty.draw_variant || state->dirty.draw_base_instance) { |
| enum pvr_pds_vertex_attrib_program_type prog_type; |
| const struct pvr_pds_attrib_program *program; |
| |
| if (state->draw_state.draw_indirect) |
| prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT; |
| else if (state->draw_state.base_instance) |
| prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE; |
| else |
| prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC; |
| |
| program = |
| &gfx_pipeline->shader_state.vertex.pds_attrib_programs[prog_type]; |
| state->pds_shader.info = &program->info; |
| state->pds_shader.code_offset = program->program.code_offset; |
| |
| state->max_shared_regs = |
| MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline)); |
| |
| pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline); |
| } |
| |
| if (state->push_constants.dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS) { |
| result = pvr_cmd_upload_push_consts(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| |
| state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding; |
| state->dirty.fragment_descriptors = state->dirty.vertex_descriptors; |
| |
| /* Account for dirty descriptor set. */ |
| /* TODO: It could be the case that there are no descriptors for a specific |
| * stage, or that the update descriptors aren't active for a particular |
| * stage. In such cases we could avoid regenerating the descriptor PDS |
| * program. |
| */ |
| state->dirty.vertex_descriptors |= state->dirty.gfx_desc_dirty; |
| state->dirty.fragment_descriptors |= state->dirty.gfx_desc_dirty; |
| |
| if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) |
| state->dirty.fragment_descriptors = true; |
| |
| state->dirty.vertex_descriptors |= |
| state->push_constants.dirty_stages & |
| (VK_SHADER_STAGE_ALL_GRAPHICS & ~VK_SHADER_STAGE_FRAGMENT_BIT); |
| state->dirty.fragment_descriptors |= state->push_constants.dirty_stages & |
| VK_SHADER_STAGE_FRAGMENT_BIT; |
| |
| if (state->dirty.fragment_descriptors) { |
| result = pvr_setup_descriptor_mappings( |
| cmd_buffer, |
| PVR_STAGE_ALLOCATION_FRAGMENT, |
| &state->gfx_pipeline->shader_state.fragment.descriptor_state, |
| NULL, |
| &state->pds_fragment_descriptor_data_offset); |
| if (result != VK_SUCCESS) { |
| mesa_loge("Could not setup fragment descriptor mappings."); |
| return result; |
| } |
| } |
| |
| if (state->dirty.vertex_descriptors) { |
| uint32_t pds_vertex_descriptor_data_offset; |
| |
| result = pvr_setup_descriptor_mappings( |
| cmd_buffer, |
| PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY, |
| &state->gfx_pipeline->shader_state.vertex.descriptor_state, |
| NULL, |
| &pds_vertex_descriptor_data_offset); |
| if (result != VK_SUCCESS) { |
| mesa_loge("Could not setup vertex descriptor mappings."); |
| return result; |
| } |
| |
| pvr_emit_dirty_pds_state(cmd_buffer, |
| sub_cmd, |
| pds_vertex_descriptor_data_offset); |
| } |
| |
| result = pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd); |
| |
| vk_dynamic_graphics_state_clear_dirty(dynamic_state); |
| state->dirty.gfx_desc_dirty = false; |
| state->dirty.draw_base_instance = false; |
| state->dirty.draw_variant = false; |
| state->dirty.fragment_descriptors = false; |
| state->dirty.gfx_pipeline_binding = false; |
| state->dirty.isp_userpass = false; |
| state->dirty.vertex_bindings = false; |
| state->dirty.vis_test = false; |
| |
| state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS; |
| |
| return VK_SUCCESS; |
| } |
| |
| static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology) |
| { |
| switch (topology) { |
| case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST; |
| case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST; |
| case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP; |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST; |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP; |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN; |
| case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ; |
| case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ; |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ; |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ; |
| case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: |
| return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST; |
| default: |
| unreachable("Undefined primitive topology"); |
| } |
| } |
| |
| /* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */ |
| /* Aligned to 128 bit for PDS loads / stores */ |
| #define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8 |
| |
| static VkResult |
| pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_csb *const csb, |
| pvr_dev_addr_t idx_buffer_addr, |
| uint32_t idx_stride, |
| struct ROGUE_VDMCTRL_INDEX_LIST0 *list_hdr, |
| struct pvr_buffer *buffer, |
| VkDeviceSize offset, |
| uint32_t count, |
| uint32_t stride) |
| { |
| struct pvr_pds_drawindirect_program pds_prog = { 0 }; |
| uint32_t word0; |
| |
| /* Draw indirect always has index offset and instance count. */ |
| list_hdr->index_offset_present = true; |
| list_hdr->index_instance_count_present = true; |
| |
| pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr); |
| |
| pds_prog.support_base_instance = true; |
| pds_prog.arg_buffer = buffer->dev_addr.addr + offset; |
| pds_prog.index_buffer = idx_buffer_addr.addr; |
| pds_prog.index_block_header = word0; |
| pds_prog.index_stride = idx_stride; |
| pds_prog.num_views = 1U; |
| |
| /* TODO: See if we can pre-upload the code section of all the pds programs |
| * and reuse them here. |
| */ |
| /* Generate and upload the PDS programs (code + data). */ |
| for (uint32_t i = 0U; i < count; i++) { |
| const struct pvr_device_info *dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_suballoc_bo *dummy_bo; |
| struct pvr_suballoc_bo *pds_bo; |
| uint32_t *dummy_stream; |
| uint32_t *pds_base; |
| uint32_t pds_size; |
| VkResult result; |
| |
| /* TODO: Move this outside the loop and allocate all of them in one go? */ |
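| /* When the control stream is processed, the indirect-draw PDS program |
| * fills this dummy block with the real index list; the stream link |
| * emitted below then makes the VDM execute it. |
| */ |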
| result = pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.general_heap, |
| DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE, |
| &dummy_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pds_prog.increment_draw_id = (i != 0); |
| pds_prog.index_list_addr_buffer = dummy_bo->dev_addr.addr; |
| |
| if (state->draw_state.draw_indexed) { |
| pvr_pds_generate_draw_elements_indirect(&pds_prog, |
| 0, |
| PDS_GENERATE_SIZES, |
| dev_info); |
| } else { |
| pvr_pds_generate_draw_arrays_indirect(&pds_prog, |
| 0, |
| PDS_GENERATE_SIZES, |
| dev_info); |
| } |
| |
| pds_size = PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned + |
| pds_prog.program.code_size_aligned); |
| |
| result = pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.pds_heap, |
| pds_size, |
| &pds_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pds_base = pvr_bo_suballoc_get_map_addr(pds_bo); |
| memcpy(pds_base, |
| pds_prog.program.code, |
| PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned)); |
| |
| if (state->draw_state.draw_indexed) { |
| pvr_pds_generate_draw_elements_indirect( |
| &pds_prog, |
| pds_base + pds_prog.program.code_size_aligned, |
| PDS_GENERATE_DATA_SEGMENT, |
| dev_info); |
| } else { |
| pvr_pds_generate_draw_arrays_indirect( |
| &pds_prog, |
| pds_base + pds_prog.program.code_size_aligned, |
| PDS_GENERATE_DATA_SEGMENT, |
| dev_info); |
| } |
| |
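| /* Emit the VDM PDS state words that point at the indirect-draw PDS |
| * program uploaded above. |
| */ |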
| pvr_csb_set_relocation_mark(csb); |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) { |
| state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ANY; |
| |
| state0.pds_temp_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.temp_size_aligned), |
| ROGUE_VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE); |
| |
| state0.pds_data_size = |
| DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned), |
| ROGUE_VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE); |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) { |
| const uint32_t data_offset = |
| pds_bo->dev_addr.addr + |
| PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned) - |
| cmd_buffer->device->heaps.pds_heap->base_addr.addr; |
| |
| state1.pds_data_addr = PVR_DEV_ADDR(data_offset); |
| state1.sd_type = ROGUE_VDMCTRL_SD_TYPE_PDS; |
| state1.sd_next_type = ROGUE_VDMCTRL_SD_TYPE_NONE; |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) { |
| const uint32_t code_offset = |
| pds_bo->dev_addr.addr - |
| cmd_buffer->device->heaps.pds_heap->base_addr.addr; |
| |
| state2.pds_code_addr = PVR_DEV_ADDR(code_offset); |
| } |
| |
| pvr_csb_clear_relocation_mark(csb); |
| |
| /* We don't really need to set the relocation mark since the following |
| * state update is just one emit but let's be nice and use it. |
| */ |
| pvr_csb_set_relocation_mark(csb); |
| |
| /* Sync task to ensure the VDM doesn't start reading the dummy blocks |
| * before they are ready. |
| */ |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) { |
| list0.primitive_topology = ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST; |
| } |
| |
| pvr_csb_clear_relocation_mark(csb); |
| |
| dummy_stream = pvr_bo_suballoc_get_map_addr(dummy_bo); |
| |
| /* For non-indexed draw cmds fill in the dummy block's header here (as it |
| * won't change based on the indirect args) and, in both cases, advance by |
| * the in-use size of the dummy block. |
| */ |
| if (!state->draw_state.draw_indexed) { |
| dummy_stream[0] = word0; |
| dummy_stream += 4; |
| } else { |
| dummy_stream += 5; |
| } |
| |
| /* clang-format off */ |
| pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word); |
| /* clang-format on */ |
| |
| pvr_csb_set_relocation_mark(csb); |
| |
| /* Stream link to the first dummy which forces the VDM to discard any |
| * prefetched (dummy) control stream. |
| */ |
| pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) { |
| link.with_return = true; |
| link.link_addrmsb = dummy_bo->dev_addr; |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) { |
| link.link_addrlsb = dummy_bo->dev_addr; |
| } |
| |
| pvr_csb_clear_relocation_mark(csb); |
| |
| /* Point the pds program to the next argument buffer and the next VDM |
| * dummy buffer. |
| */ |
| pds_prog.arg_buffer += stride; |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| #undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE |
| |
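| /* Emit the VDM index list block for a draw. For direct draws the index |
| * and instance counts are emitted inline; for indirect draws the block is |
| * written by a PDS program generated in |
| * pvr_write_draw_indirect_vdm_stream(). |
| */ |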
| static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd, |
| VkPrimitiveTopology topology, |
| uint32_t index_offset, |
| uint32_t first_index, |
| uint32_t index_count, |
| uint32_t instance_count, |
| struct pvr_buffer *buffer, |
| VkDeviceSize offset, |
| uint32_t count, |
| uint32_t stride) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| |
| const pco_data *const vs_data = &state->gfx_pipeline->vs_data; |
| struct ROGUE_VDMCTRL_INDEX_LIST0 list_hdr = { pvr_cmd_header( |
| VDMCTRL_INDEX_LIST0) }; |
| pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID; |
| struct pvr_csb *const csb = &sub_cmd->control_stream; |
| unsigned int index_stride = 0; |
| |
| list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology); |
| |
| /* firstInstance is not handled here in the VDM state; it's implemented |
| * as an addition in the PDS vertex fetch using the |
| * PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type. |
| */ |
| |
| list_hdr.index_count_present = true; |
| |
| if (instance_count > 1) |
| list_hdr.index_instance_count_present = true; |
| |
| if (index_offset) |
| list_hdr.index_offset_present = true; |
| |
| if (state->draw_state.draw_indexed) { |
| list_hdr.index_size = |
| pvr_vdmctrl_index_size_from_type(state->index_buffer_binding.type); |
| index_stride = vk_index_type_to_bytes(state->index_buffer_binding.type); |
| |
| index_buffer_addr = PVR_DEV_ADDR_OFFSET( |
| state->index_buffer_binding.buffer->dev_addr, |
| state->index_buffer_binding.offset + first_index * index_stride); |
| |
| list_hdr.index_addr_present = true; |
| list_hdr.index_base_addrmsb = index_buffer_addr; |
| } |
| |
| list_hdr.degen_cull_enable = |
| PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info, |
| vdm_degenerate_culling) && |
| !vs_data->common.uses.side_effects; |
| |
| if (state->draw_state.draw_indirect) { |
| assert(buffer); |
| pvr_write_draw_indirect_vdm_stream(cmd_buffer, |
| csb, |
| index_buffer_addr, |
| index_stride, |
| &list_hdr, |
| buffer, |
| offset, |
| count, |
| stride); |
| return; |
| } |
| |
| pvr_csb_set_relocation_mark(csb); |
| |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) { |
| list0 = list_hdr; |
| } |
| |
| if (list_hdr.index_addr_present) { |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) { |
| list1.index_base_addrlsb = index_buffer_addr; |
| } |
| } |
| |
| if (list_hdr.index_count_present) { |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) { |
| list2.index_count = index_count; |
| } |
| } |
| |
| if (list_hdr.index_instance_count_present) { |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) { |
| list3.instance_count = instance_count - 1; |
| } |
| } |
| |
| if (list_hdr.index_offset_present) { |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) { |
| list4.index_offset = index_offset; |
| } |
| } |
| |
| pvr_csb_clear_relocation_mark(csb); |
| } |
| |
| void pvr_CmdDraw(VkCommandBuffer commandBuffer, |
| uint32_t vertexCount, |
| uint32_t instanceCount, |
| uint32_t firstVertex, |
| uint32_t firstInstance) |
| { |
| const struct pvr_cmd_buffer_draw_state draw_state = { |
| .base_vertex = firstVertex, |
| .base_instance = firstInstance, |
| }; |
| |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_update_draw_state(state, &draw_state); |
| |
| result = pvr_validate_draw_state(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Write the VDM control stream for the primitive. */ |
| pvr_emit_vdm_index_list(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| dynamic_state->ia.primitive_topology, |
| firstVertex, |
| 0U, |
| vertexCount, |
| instanceCount, |
| NULL, |
| 0U, |
| 0U, |
| 0U); |
| } |
| |
| void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer, |
| uint32_t indexCount, |
| uint32_t instanceCount, |
| uint32_t firstIndex, |
| int32_t vertexOffset, |
| uint32_t firstInstance) |
| { |
| const struct pvr_cmd_buffer_draw_state draw_state = { |
| .base_vertex = vertexOffset, |
| .base_instance = firstInstance, |
| .draw_indexed = true, |
| }; |
| |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_update_draw_state(state, &draw_state); |
| |
| result = pvr_validate_draw_state(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Write the VDM control stream for the primitive. */ |
| pvr_emit_vdm_index_list(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| dynamic_state->ia.primitive_topology, |
| vertexOffset, |
| firstIndex, |
| indexCount, |
| instanceCount, |
| NULL, |
| 0U, |
| 0U, |
| 0U); |
| } |
| |
| void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, |
| VkBuffer _buffer, |
| VkDeviceSize offset, |
| uint32_t drawCount, |
| uint32_t stride) |
| { |
| const struct pvr_cmd_buffer_draw_state draw_state = { |
| .draw_indirect = true, |
| .draw_indexed = true, |
| }; |
| |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer); |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_update_draw_state(state, &draw_state); |
| |
| result = pvr_validate_draw_state(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Write the VDM control stream for the primitive. */ |
| pvr_emit_vdm_index_list(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| dynamic_state->ia.primitive_topology, |
| 0U, |
| 0U, |
| 0U, |
| 0U, |
| buffer, |
| offset, |
| drawCount, |
| stride); |
| } |
| |
| void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer, |
| VkBuffer _buffer, |
| VkDeviceSize offset, |
| uint32_t drawCount, |
| uint32_t stride) |
| { |
| const struct pvr_cmd_buffer_draw_state draw_state = { |
| .draw_indirect = true, |
| }; |
| |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer); |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_update_draw_state(state, &draw_state); |
| |
| result = pvr_validate_draw_state(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Write the VDM control stream for the primitive. */ |
| pvr_emit_vdm_index_list(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| dynamic_state->ia.primitive_topology, |
| 0U, |
| 0U, |
| 0U, |
| 0U, |
| buffer, |
| offset, |
| drawCount, |
| stride); |
| } |
| |
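| /* Perform any end-of-tile resolves that could not be done by the hardware |
| * render (resolve_type == PVR_RESOLVE_TYPE_TRANSFER) using transfer |
| * copies/resolves. |
| */ |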
| static VkResult |
| pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_render_pass_info *info) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &state->render_pass_info.pass->hw_setup->renders[info->current_hw_subpass]; |
| |
| for (uint32_t i = 0U; i < hw_render->eot_surface_count; i++) { |
| const struct pvr_renderpass_hwsetup_eot_surface *surface = |
| &hw_render->eot_surfaces[i]; |
| const uint32_t color_attach_idx = surface->src_attachment_idx; |
| const uint32_t resolve_attach_idx = surface->attachment_idx; |
| VkImageSubresourceLayers src_subresource; |
| VkImageSubresourceLayers dst_subresource; |
| struct pvr_image_view *dst_view; |
| struct pvr_image_view *src_view; |
| VkFormat src_format; |
| VkFormat dst_format; |
| VkImageCopy2 region; |
| VkResult result; |
| |
| if (!surface->need_resolve || |
| surface->resolve_type != PVR_RESOLVE_TYPE_TRANSFER) |
| continue; |
| |
| dst_view = info->attachments[resolve_attach_idx]; |
| src_view = info->attachments[color_attach_idx]; |
| |
| src_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; |
| src_subresource.mipLevel = src_view->vk.base_mip_level; |
| src_subresource.baseArrayLayer = src_view->vk.base_array_layer; |
| src_subresource.layerCount = src_view->vk.layer_count; |
| |
| dst_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; |
| dst_subresource.mipLevel = dst_view->vk.base_mip_level; |
| dst_subresource.baseArrayLayer = dst_view->vk.base_array_layer; |
| dst_subresource.layerCount = dst_view->vk.layer_count; |
| |
| region.srcOffset = (VkOffset3D){ info->render_area.offset.x, |
| info->render_area.offset.y, |
| 0 }; |
| region.dstOffset = (VkOffset3D){ info->render_area.offset.x, |
| info->render_area.offset.y, |
| 0 }; |
| region.extent = (VkExtent3D){ info->render_area.extent.width, |
| info->render_area.extent.height, |
| 1 }; |
| |
| region.srcSubresource = src_subresource; |
| region.dstSubresource = dst_subresource; |
| |
| /* TODO: If ERN_46863 is supported, depth and stencil are sampled |
| * separately from images with combined depth+stencil formats. Add logic |
| * here to handle this using the appropriate format from the image view. |
| */ |
| src_format = src_view->vk.image->format; |
| dst_format = dst_view->vk.image->format; |
| src_view->vk.image->format = src_view->vk.format; |
| dst_view->vk.image->format = dst_view->vk.format; |
| |
| result = pvr_copy_or_resolve_color_image_region( |
| cmd_buffer, |
| vk_to_pvr_image(src_view->vk.image), |
| vk_to_pvr_image(dst_view->vk.image), |
| ®ion); |
| |
| src_view->vk.image->format = src_format; |
| dst_view->vk.image->format = dst_format; |
| |
| state->current_sub_cmd->transfer.serialize_with_frag = true; |
| |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| |
| return pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer, |
| const VkSubpassEndInfo *pSubpassEndInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_image_view **attachments; |
| VkClearValue *clear_values; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| assert(state->render_pass_info.pass); |
| assert(state->render_pass_info.framebuffer); |
| |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, |
| &state->render_pass_info); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Save the required fields before clearing render_pass_info struct. */ |
| attachments = state->render_pass_info.attachments; |
| clear_values = state->render_pass_info.clear_values; |
| |
| memset(&state->render_pass_info, 0, sizeof(state->render_pass_info)); |
| |
| state->render_pass_info.attachments = attachments; |
| state->render_pass_info.clear_values = clear_values; |
| } |
| |
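| /* Patch the DBSC commands deferred by a secondary command buffer, |
| * rebasing their depth bias and scissor indices onto this primary command |
| * buffer's arrays before appending those arrays. |
| */ |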
| static VkResult |
| pvr_execute_deferred_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_cmd_buffer *sec_cmd_buffer) |
| { |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &cmd_buffer->vk.dynamic_graphics_state; |
| const uint32_t prim_db_elems = |
| util_dynarray_num_elements(&cmd_buffer->depth_bias_array, |
| struct pvr_depth_bias_state); |
| const uint32_t prim_scissor_elems = |
| util_dynarray_num_elements(&cmd_buffer->scissor_array, |
| struct pvr_scissor_words); |
| |
| util_dynarray_foreach (&sec_cmd_buffer->deferred_csb_commands, |
| struct pvr_deferred_cs_command, |
| cmd) { |
| switch (cmd->type) { |
| case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC: { |
| const uint32_t scissor_idx = |
| prim_scissor_elems + cmd->dbsc.state.scissor_index; |
| const uint32_t db_idx = |
| prim_db_elems + cmd->dbsc.state.depthbias_index; |
| const uint32_t num_dwords = |
| pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPDBSC); |
| struct pvr_suballoc_bo *suballoc_bo; |
| uint32_t ppp_state[num_dwords]; |
| VkResult result; |
| |
| pvr_csb_pack (&ppp_state[0], TA_STATE_HEADER, header) { |
| header.pres_ispctl_dbsc = true; |
| } |
| |
| pvr_csb_pack (&ppp_state[1], TA_STATE_ISPDBSC, ispdbsc) { |
| ispdbsc.dbindex = db_idx; |
| ispdbsc.scindex = scissor_idx; |
| } |
| |
| result = pvr_cmd_buffer_upload_general(cmd_buffer, |
| &ppp_state[0], |
| sizeof(ppp_state), |
| &suballoc_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pvr_csb_pack (&cmd->dbsc.vdm_state[0], VDMCTRL_PPP_STATE0, state) { |
| state.word_count = num_dwords; |
| state.addrmsb = suballoc_bo->dev_addr; |
| } |
| |
| pvr_csb_pack (&cmd->dbsc.vdm_state[1], VDMCTRL_PPP_STATE1, state) { |
| state.addrlsb = suballoc_bo->dev_addr; |
| } |
| |
| break; |
| } |
| |
| case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2: { |
| const uint32_t scissor_idx = |
| prim_scissor_elems + cmd->dbsc2.state.scissor_index; |
| const uint32_t db_idx = |
| prim_db_elems + cmd->dbsc2.state.depthbias_index; |
| |
| uint32_t *const addr = |
| (uint32_t *)pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo) + |
| cmd->dbsc2.patch_offset; |
| |
| assert(pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo)); |
| |
| pvr_csb_pack (addr, TA_STATE_ISPDBSC, ispdbsc) { |
| ispdbsc.dbindex = db_idx; |
| ispdbsc.scindex = scissor_idx; |
| } |
| |
| break; |
| } |
| |
| default: |
| unreachable("Invalid deferred control stream command type."); |
| break; |
| } |
| } |
| |
| util_dynarray_append_dynarray(&cmd_buffer->depth_bias_array, |
| &sec_cmd_buffer->depth_bias_array); |
| |
| util_dynarray_append_dynarray(&cmd_buffer->scissor_array, |
| &sec_cmd_buffer->scissor_array); |
| |
| BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS); |
| cmd_buffer->scissor_words = (struct pvr_scissor_words){ 0 }; |
| |
| return VK_SUCCESS; |
| } |
| |
| /* Caller needs to make sure that it ends the current sub_cmd. This function |
| * only creates a copy of sec_sub_cmd and links it to the cmd_buffer's |
| * sub_cmd list. |
| */ |
| static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd *sec_sub_cmd) |
| { |
| struct pvr_sub_cmd *primary_sub_cmd = |
| vk_zalloc(&cmd_buffer->vk.pool->alloc, |
| sizeof(*primary_sub_cmd), |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!primary_sub_cmd) { |
| return vk_command_buffer_set_error(&cmd_buffer->vk, |
| VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| primary_sub_cmd->type = sec_sub_cmd->type; |
| primary_sub_cmd->owned = false; |
| |
| list_addtail(&primary_sub_cmd->link, &cmd_buffer->sub_cmds); |
| |
| switch (sec_sub_cmd->type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: |
| primary_sub_cmd->gfx = sec_sub_cmd->gfx; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY: |
| case PVR_SUB_CMD_TYPE_COMPUTE: |
| primary_sub_cmd->compute = sec_sub_cmd->compute; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
| primary_sub_cmd->transfer = sec_sub_cmd->transfer; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| primary_sub_cmd->event = sec_sub_cmd->event; |
| break; |
| |
| default: |
| unreachable("Unsupported sub-command type"); |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
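| /* Execute a secondary command buffer's graphics sub-commands within the |
| * primary's current render, either by copying its control stream (when |
| * deferred DBSC patching is used) or by linking to it in place. |
| */ |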
| static VkResult |
| pvr_execute_graphics_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_cmd_buffer *sec_cmd_buffer) |
| { |
| const struct pvr_device_info *dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_sub_cmd *primary_sub_cmd = state->current_sub_cmd; |
| struct pvr_sub_cmd *first_sec_cmd; |
| VkResult result; |
| |
| /* Inherited queries are not supported. */ |
| assert(!state->vis_test_enabled); |
| |
| if (list_is_empty(&sec_cmd_buffer->sub_cmds)) |
| return VK_SUCCESS; |
| |
| first_sec_cmd = |
| list_first_entry(&sec_cmd_buffer->sub_cmds, struct pvr_sub_cmd, link); |
| |
| /* Kick a render if we have a new base address. */ |
| if (primary_sub_cmd->gfx.query_pool && first_sec_cmd->gfx.query_pool && |
| primary_sub_cmd->gfx.query_pool != first_sec_cmd->gfx.query_pool) { |
| state->current_sub_cmd->gfx.barrier_store = true; |
| |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| result = |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| primary_sub_cmd = state->current_sub_cmd; |
| |
| /* Use existing render setup, but load color attachments from HW |
| * Background object. |
| */ |
| primary_sub_cmd->gfx.barrier_load = true; |
| primary_sub_cmd->gfx.barrier_store = false; |
| } |
| |
| list_for_each_entry (struct pvr_sub_cmd, |
| sec_sub_cmd, |
| &sec_cmd_buffer->sub_cmds, |
| link) { |
| /* Only graphics secondary execution supported within a renderpass. */ |
| assert(sec_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS); |
| |
| if (!sec_sub_cmd->gfx.empty_cmd) |
| primary_sub_cmd->gfx.empty_cmd = false; |
| |
| if (sec_sub_cmd->gfx.query_pool) { |
| primary_sub_cmd->gfx.query_pool = sec_sub_cmd->gfx.query_pool; |
| |
| util_dynarray_append_dynarray(&state->query_indices, |
| &sec_sub_cmd->gfx.sec_query_indices); |
| } |
| |
| if (pvr_cmd_uses_deferred_cs_cmds(sec_cmd_buffer)) { |
| /* TODO: If the secondary buffer was created with |
| * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, we patch the stream and |
| * copy it to the primary stream using pvr_csb_copy below. This will need |
| * locking if the same secondary command buffer is executed in multiple |
| * primary buffers at the same time. |
| */ |
| result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| result = pvr_csb_copy(&primary_sub_cmd->gfx.control_stream, |
| &sec_sub_cmd->gfx.control_stream); |
| if (result != VK_SUCCESS) |
| return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| } else { |
| result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pvr_csb_emit_link( |
| &primary_sub_cmd->gfx.control_stream, |
| pvr_csb_get_start_address(&sec_sub_cmd->gfx.control_stream), |
| true); |
| } |
| |
| if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info, |
| compute_overlap)) { |
| primary_sub_cmd->gfx.job.disable_compute_overlap |= |
| sec_sub_cmd->gfx.job.disable_compute_overlap; |
| } |
| |
| primary_sub_cmd->gfx.max_tiles_in_flight = |
| MIN2(primary_sub_cmd->gfx.max_tiles_in_flight, |
| sec_sub_cmd->gfx.max_tiles_in_flight); |
| |
| /* Pass loaded depth/stencil usage from secondary command buffer. */ |
| if (sec_sub_cmd->gfx.depth_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED) |
| primary_sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED; |
| |
| if (sec_sub_cmd->gfx.stencil_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED) |
| primary_sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED; |
| |
| /* Pass depth/stencil modification state from secondary command buffer. */ |
| if (sec_sub_cmd->gfx.modifies_depth) |
| primary_sub_cmd->gfx.modifies_depth = true; |
| |
| if (sec_sub_cmd->gfx.modifies_stencil) |
| primary_sub_cmd->gfx.modifies_stencil = true; |
| |
| if (sec_sub_cmd->gfx.barrier_store) { |
| struct pvr_sub_cmd *sec_next = |
| list_entry(sec_sub_cmd->link.next, struct pvr_sub_cmd, link); |
| |
| /* This shouldn't be the last sub cmd. There should be a barrier load |
| * subsequent to the barrier store. |
| */ |
| assert(list_last_entry(&sec_cmd_buffer->sub_cmds, |
| struct pvr_sub_cmd, |
| link) != sec_sub_cmd); |
| |
| /* Kick render to store stencil. */ |
| state->current_sub_cmd->gfx.barrier_store = true; |
| state->current_sub_cmd->gfx.empty_cmd = false; |
| |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| result = |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| primary_sub_cmd = state->current_sub_cmd; |
| |
| /* Use existing render setup, but load color attachments from HW |
| * Background object. |
| */ |
| primary_sub_cmd->gfx.barrier_load = sec_next->gfx.barrier_load; |
| primary_sub_cmd->gfx.barrier_store = sec_next->gfx.barrier_store; |
| primary_sub_cmd->gfx.empty_cmd = false; |
| } |
| |
| if (!PVR_HAS_FEATURE(dev_info, gs_rta_support)) { |
| util_dynarray_append_dynarray(&cmd_buffer->deferred_clears, |
| &sec_cmd_buffer->deferred_clears); |
| } |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer, |
| uint32_t commandBufferCount, |
| const VkCommandBuffer *pCommandBuffers) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_cmd_buffer *last_cmd_buffer; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); |
| |
| /* Reset the CPU copy of the most recent PPP state of the primary command |
| * buffer. |
| * |
| * The next draw call in the primary after CmdExecuteCommands may send |
| * redundant state, if it all goes in the same geom job. |
| * |
| * Can't just copy state from the secondary because the recording state of |
| * the secondary command buffers would have been deleted at this point. |
| */ |
| pvr_reset_graphics_dirty_state(cmd_buffer, false); |
| |
| if (state->current_sub_cmd && |
| state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) { |
| for (uint32_t i = 0; i < commandBufferCount; i++) { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]); |
| |
| assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); |
| |
| result = pvr_execute_graphics_cmd_buffer(cmd_buffer, sec_cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| } |
| |
| last_cmd_buffer = |
| pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]); |
| |
| /* Set barriers from the final secondary command buffer. */ |
| for (uint32_t i = 0; i != PVR_NUM_SYNC_PIPELINE_STAGES; i++) { |
| state->barriers_needed[i] |= |
| last_cmd_buffer->state.barriers_needed[i] & |
| PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS; |
| } |
| } else { |
| for (uint32_t i = 0; i < commandBufferCount; i++) { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]); |
| |
| assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); |
| |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| list_for_each_entry_safe (struct pvr_sub_cmd, |
| sec_sub_cmd, |
| &sec_cmd_buffer->sub_cmds, |
| link) { |
| result = pvr_execute_sub_cmd(cmd_buffer, sec_sub_cmd); |
| if (result != VK_SUCCESS) |
| return; |
| } |
| } |
| |
| last_cmd_buffer = |
| pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]); |
| |
| memcpy(state->barriers_needed, |
| last_cmd_buffer->state.barriers_needed, |
| sizeof(state->barriers_needed)); |
| } |
| } |
| |
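| /* Emit a full-screen transparent object using the static clear PPP |
| * template. Used to flush tags when the ISP user-pass counter wraps. |
| */ |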
| static void pvr_insert_transparent_obj(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| struct pvr_device *const device = cmd_buffer->device; |
| /* Yes we want a copy. The user could be recording multiple command buffers |
| * in parallel so writing the template in place could cause problems. |
| */ |
| struct pvr_static_clear_ppp_template clear = |
| device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT]; |
| uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT] = { 0 }; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| struct pvr_suballoc_bo *ppp_bo; |
| |
| assert(clear.requires_pds_state); |
| |
| /* Patch the template. */ |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE], |
| TA_STATE_PDS_SHADERBASE, |
| shaderbase) { |
| shaderbase.addr = PVR_DEV_ADDR(device->nop_program.pds.data_offset); |
| } |
| |
| clear.config.pds_state = &pds_state; |
| |
| clear.config.ispctl.upass = cmd_buffer->state.render_pass_info.isp_userpass; |
| |
| /* Emit PPP state from template. */ |
| |
| pvr_emit_ppp_from_template(csb, &clear, &ppp_bo); |
| list_add(&ppp_bo->link, &cmd_buffer->bo_list); |
| |
| /* Emit VDM state. */ |
| |
| pvr_emit_clear_words(cmd_buffer, sub_cmd); |
| |
| /* Reset graphics state. */ |
| pvr_reset_graphics_dirty_state(cmd_buffer, false); |
| } |
| |
| static inline struct pvr_render_subpass * |
| pvr_get_current_subpass(const struct pvr_cmd_buffer_state *const state) |
| { |
| const uint32_t subpass_idx = state->render_pass_info.subpass_idx; |
| |
| return &state->render_pass_info.pass->subpasses[subpass_idx]; |
| } |
| |
| void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer, |
| const VkSubpassBeginInfo *pSubpassBeginInfo, |
| const VkSubpassEndInfo *pSubpassEndInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_render_pass_info *rp_info = &state->render_pass_info; |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass; |
| struct pvr_renderpass_hwsetup_render *next_hw_render; |
| const struct pvr_render_pass *pass = rp_info->pass; |
| const struct pvr_renderpass_hw_map *current_map; |
| const struct pvr_renderpass_hw_map *next_map; |
| struct pvr_load_op *hw_subpass_load_op; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| current_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx]; |
| next_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx + 1]; |
| next_hw_render = &pass->hw_setup->renders[next_map->render]; |
| |
| if (current_map->render != next_map->render) { |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, rp_info); |
| if (result != VK_SUCCESS) |
| return; |
| |
| rp_info->current_hw_subpass = next_map->render; |
| |
| result = |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| if (result != VK_SUCCESS) |
| return; |
| |
| rp_info->enable_bg_tag = false; |
| rp_info->process_empty_tiles = false; |
| |
| /* If this subpass contains any load ops, the HW Background Object must |
| * be run to perform the clears/loads. |
| */ |
| if (next_hw_render->color_init_count > 0) { |
| rp_info->enable_bg_tag = true; |
| |
| for (uint32_t i = 0; i < next_hw_render->color_init_count; i++) { |
| /* Empty tiles need to be cleared too. */ |
| if (next_hw_render->color_init[i].op == |
| VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| rp_info->process_empty_tiles = true; |
| break; |
| } |
| } |
| } |
| |
| /* Set isp_userpass to zero for new hw_render. This will be used to set |
| * ROGUE_CR_ISP_CTL::upass_start. |
| */ |
| rp_info->isp_userpass = 0; |
| } |
| |
| hw_subpass = &next_hw_render->subpasses[next_map->subpass]; |
| hw_subpass_load_op = hw_subpass->load_op; |
| |
| if (hw_subpass_load_op) { |
| result = pvr_cs_write_load_op(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| hw_subpass_load_op, |
| rp_info->isp_userpass); |
| if (result != VK_SUCCESS) |
| return; |
| } |
| |
| /* Pipelines are created for a particular subpass so unbind but leave the |
| * vertex and descriptor bindings intact as they are orthogonal to the |
| * subpass. |
| */ |
| state->gfx_pipeline = NULL; |
| |
| /* User-pass spawn is 4 bits, so if the driver has to wrap it, it will emit |
| * a full-screen transparent object to flush all tags up until now, then the |
| * user-pass spawn value will implicitly be reset to 0 because |
| * pvr_render_subpass::isp_userpass values are stored ANDed with |
| * ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX. |
| */ |
| /* If hw_subpass_load_op is valid then pvr_cs_write_load_op has already |
| * done a full-screen transparent object. |
| */ |
| if (rp_info->isp_userpass == ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX && |
| !hw_subpass_load_op) { |
| pvr_insert_transparent_obj(cmd_buffer, &state->current_sub_cmd->gfx); |
| } |
| |
| rp_info->subpass_idx++; |
| |
| rp_info->isp_userpass = pass->subpasses[rp_info->subpass_idx].isp_userpass; |
| state->dirty.isp_userpass = true; |
| |
| rp_info->pipeline_bind_point = |
| pass->subpasses[rp_info->subpass_idx].pipeline_bind_point; |
| |
| pvr_stash_depth_format(state, &state->current_sub_cmd->gfx); |
| } |
| |
| static bool |
| pvr_stencil_has_self_dependency(const struct pvr_cmd_buffer_state *const state) |
| { |
| const struct pvr_render_subpass *const current_subpass = |
| pvr_get_current_subpass(state); |
| const uint32_t *const input_attachments = current_subpass->input_attachments; |
| |
| if (current_subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED) |
| return false; |
| |
| /* We only need to check the current software subpass as we don't support |
| * merging to/from a subpass with self-dep stencil. |
| */ |
| |
| for (uint32_t i = 0; i < current_subpass->input_count; i++) { |
| if (input_attachments[i] == current_subpass->depth_stencil_attachment) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static bool pvr_is_stencil_store_load_needed( |
| const struct pvr_cmd_buffer *const cmd_buffer, |
| VkPipelineStageFlags2 vk_src_stage_mask, |
| VkPipelineStageFlags2 vk_dst_stage_mask, |
| uint32_t memory_barrier_count, |
| const VkMemoryBarrier2 *const memory_barriers, |
| uint32_t image_barrier_count, |
| const VkImageMemoryBarrier2 *const image_barriers) |
| { |
| const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const uint32_t fragment_test_stages = |
| VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | |
| VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; |
| const struct pvr_render_pass *const pass = state->render_pass_info.pass; |
| const struct pvr_renderpass_hwsetup_render *hw_render; |
| struct pvr_image_view **const attachments = |
| state->render_pass_info.attachments; |
| const struct pvr_image_view *attachment; |
| uint32_t hw_render_idx; |
| |
| if (!pass) |
| return false; |
| |
| hw_render_idx = state->current_sub_cmd->gfx.hw_render_idx; |
| hw_render = &pass->hw_setup->renders[hw_render_idx]; |
| |
| if (hw_render->ds_attach_idx == VK_ATTACHMENT_UNUSED) |
| return false; |
| |
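| /* Secondary command buffers are recorded without an attachment list, so |
| * the image barrier check below matches on the barrier contents alone. |
| */ |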
| if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { |
| attachment = attachments[hw_render->ds_attach_idx]; |
| } else { |
| assert(!attachments); |
| attachment = NULL; |
| } |
| |
| if (!(vk_src_stage_mask & fragment_test_stages) && |
| vk_dst_stage_mask & VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT) |
| return false; |
| |
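| /* A memory barrier from depth/stencil writes to input attachment reads |
| * means the current subpass may consume its own stencil, which requires a |
| * stencil store/load. |
| */ |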
| for (uint32_t i = 0; i < memory_barrier_count; i++) { |
| const uint32_t stencil_write_bit = |
| VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; |
| const uint32_t input_attachment_read_bit = |
| VK_ACCESS_INPUT_ATTACHMENT_READ_BIT; |
| |
| if (!(memory_barriers[i].srcAccessMask & stencil_write_bit)) |
| continue; |
| |
| if (!(memory_barriers[i].dstAccessMask & input_attachment_read_bit)) |
| continue; |
| |
| return pvr_stencil_has_self_dependency(state); |
| } |
| |
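| /* For image barriers, only the stencil aspect of the bound depth/stencil |
| * attachment's image is of interest (or of any stencil-capable image when |
| * no attachment list is available). |
| */ |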
| for (uint32_t i = 0; i < image_barrier_count; i++) { |
| PVR_FROM_HANDLE(pvr_image, image, image_barriers[i].image); |
| const uint32_t stencil_bit = VK_IMAGE_ASPECT_STENCIL_BIT; |
| |
| if (!(image_barriers[i].subresourceRange.aspectMask & stencil_bit)) |
| continue; |
| |
| if (attachment && image != vk_to_pvr_image(attachment->vk.image)) |
| continue; |
| |
| if (!vk_format_has_stencil(image->vk.format)) |
| continue; |
| |
| return pvr_stencil_has_self_dependency(state); |
| } |
| |
| return false; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_insert_mid_frag_barrier_event(struct pvr_cmd_buffer *cmd_buffer, |
| uint32_t src_stage_mask, |
| uint32_t dst_stage_mask) |
| { |
| VkResult result; |
| |
| assert(cmd_buffer->state.current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS); |
| |
| cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false; |
| |
| /* Submit graphics job to store stencil. */ |
| cmd_buffer->state.current_sub_cmd->gfx.barrier_store = true; |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){ |
| .type = PVR_EVENT_TYPE_BARRIER, |
| .barrier = { |
| .wait_for_stage_mask = src_stage_mask, |
| .wait_at_stage_mask = dst_stage_mask, |
| }, |
| }; |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| |
| /* Use existing render setup, but load color attachments from HW BGOBJ */ |
| cmd_buffer->state.current_sub_cmd->gfx.barrier_load = true; |
| cmd_buffer->state.current_sub_cmd->gfx.barrier_store = false; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_insert_barrier_event(struct pvr_cmd_buffer *cmd_buffer, |
| uint32_t src_stage_mask, |
| uint32_t dst_stage_mask) |
| { |
| VkResult result; |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){ |
| .type = PVR_EVENT_TYPE_BARRIER, |
| .barrier = { |
| .wait_for_stage_mask = src_stage_mask, |
| .wait_at_stage_mask = dst_stage_mask, |
| }, |
| }; |
| |
| return pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| /* This is just enough to handle vkCmdPipelineBarrier(). |
| * TODO: Complete? |
| */ |
| void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, |
| const VkDependencyInfo *pDependencyInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_render_pass *const render_pass = |
| state->render_pass_info.pass; |
| VkPipelineStageFlags vk_src_stage_mask = 0U; |
| VkPipelineStageFlags vk_dst_stage_mask = 0U; |
| bool is_stencil_store_load_needed; |
| uint32_t required_stage_mask = 0U; |
| uint32_t src_stage_mask; |
| uint32_t dst_stage_mask; |
| bool is_barrier_needed; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
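| /* Accumulate the source and destination stage masks over every memory, |
| * buffer and image barrier in the dependency info. |
| */ |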
| for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) { |
| vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; |
| vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask; |
| } |
| |
| for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) { |
| vk_src_stage_mask |= |
| pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; |
| vk_dst_stage_mask |= |
| pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask; |
| } |
| |
| for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) { |
| vk_src_stage_mask |= |
| pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; |
| vk_dst_stage_mask |= |
| pDependencyInfo->pImageMemoryBarriers[i].dstStageMask; |
| } |
| |
| src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask); |
| dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask); |
| |
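| /* Gather, for each destination stage involved, the source stages that |
| * still have outstanding work to synchronize against. |
| */ |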
| for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) { |
| if (!(dst_stage_mask & BITFIELD_BIT(stage))) |
| continue; |
| |
| required_stage_mask |= state->barriers_needed[stage]; |
| } |
| |
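| /* The source stages being waited on here no longer need a barrier for |
| * these destination stages. |
| */ |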
| src_stage_mask &= required_stage_mask; |
| for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) { |
| if (!(dst_stage_mask & BITFIELD_BIT(stage))) |
| continue; |
| |
| state->barriers_needed[stage] &= ~src_stage_mask; |
| } |
| |
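| /* Decide whether an explicit barrier sub command is needed. Geometry to |
| * fragment ordering is implicit, and single-stage fragment or compute |
| * dependencies are handled inline in the current sub command. |
| */ |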
| if (src_stage_mask == 0 || dst_stage_mask == 0) { |
| is_barrier_needed = false; |
| } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT && |
| dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) { |
| /* This dependency is handled implicitly, so no barrier is needed. */ |
| is_barrier_needed = false; |
| } else if (src_stage_mask == dst_stage_mask && |
| util_bitcount(src_stage_mask) == 1) { |
| struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd; |
| |
| switch (src_stage_mask) { |
| case PVR_PIPELINE_STAGE_FRAG_BIT: |
| is_barrier_needed = false; |
| |
| if (!render_pass) |
| break; |
| |
| assert(current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS); |
| |
| /* Flush all fragment work up to this point. */ |
| pvr_insert_transparent_obj(cmd_buffer, &current_sub_cmd->gfx); |
| break; |
| |
| case PVR_PIPELINE_STAGE_COMPUTE_BIT: |
| is_barrier_needed = false; |
| |
| if (!current_sub_cmd || |
| current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) { |
| break; |
| } |
| |
| /* Multiple dispatches can be merged into a single job. When back-to-back |
| * dispatches have a sequential dependency (compute -> compute pipeline |
| * barrier) we need to do the following. |
| * - Dispatch a kernel which fences all previous memory writes and |
| * flushes the MADD cache. |
| * - Issue a compute fence which ensures all previous tasks emitted |
| * by the compute data master are completed before starting |
| * anything new. |
| */ |
| |
| /* An IDFWDF (Issue Data Fence, Wait for Data Fence) makes the PDS wait |
| * for data. |
| */ |
| pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute); |
| |
| pvr_compute_generate_fence(cmd_buffer, |
| &current_sub_cmd->compute, |
| false); |
| break; |
| |
| default: |
| is_barrier_needed = false; |
| break; |
| } |
| } else { |
| is_barrier_needed = true; |
| } |
| |
| is_stencil_store_load_needed = |
| pvr_is_stencil_store_load_needed(cmd_buffer, |
| vk_src_stage_mask, |
| vk_dst_stage_mask, |
| pDependencyInfo->memoryBarrierCount, |
| pDependencyInfo->pMemoryBarriers, |
| pDependencyInfo->imageMemoryBarrierCount, |
| pDependencyInfo->pImageMemoryBarriers); |
| |
| if (is_stencil_store_load_needed) { |
| assert(render_pass); |
| result = pvr_cmd_buffer_insert_mid_frag_barrier_event(cmd_buffer, |
| src_stage_mask, |
| dst_stage_mask); |
| if (result != VK_SUCCESS) |
| mesa_loge("Failed to insert mid frag barrier event."); |
| } else if (is_barrier_needed) { |
| result = pvr_cmd_buffer_insert_barrier_event(cmd_buffer, |
| src_stage_mask, |
| dst_stage_mask); |
| if (result != VK_SUCCESS) |
| mesa_loge("Failed to insert pipeline barrier event."); |
| } |
| } |
| |
| void pvr_CmdResetEvent2(VkCommandBuffer commandBuffer, |
| VkEvent _event, |
| VkPipelineStageFlags2 stageMask) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| PVR_FROM_HANDLE(pvr_event, event, _event); |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) |
| return; |
| |
| cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){ |
| .type = PVR_EVENT_TYPE_RESET, |
| .set_reset = { |
| .event = event, |
| .wait_for_stage_mask = pvr_stage_mask_src(stageMask), |
| }, |
| }; |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| void pvr_CmdSetEvent2(VkCommandBuffer commandBuffer, |
| VkEvent _event, |
| const VkDependencyInfo *pDependencyInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| PVR_FROM_HANDLE(pvr_event, event, _event); |
| VkPipelineStageFlags2 stage_mask = 0; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) |
| return; |
| |
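| /* The stages to wait for before the event is signalled are the union of |
| * all source stages named in the dependency info. |
| */ |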
| for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) |
| stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; |
| |
| for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) |
| stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; |
| |
| for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) |
| stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; |
| |
| cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){ |
| .type = PVR_EVENT_TYPE_SET, |
| .set_reset = { |
| .event = event, |
| .wait_for_stage_mask = pvr_stage_mask_dst(stage_mask), |
| }, |
| }; |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| void pvr_CmdWaitEvents2(VkCommandBuffer commandBuffer, |
| uint32_t eventCount, |
| const VkEvent *pEvents, |
| const VkDependencyInfo *pDependencyInfos) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_event **events_array; |
| uint32_t *stage_masks; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
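| /* Allocate the event pointer array and the per-event stage masks in a |
| * single block; events_array is the base of the allocation and is what |
| * gets freed on failure. |
| */ |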
| VK_MULTIALLOC(ma); |
| vk_multialloc_add(&ma, &events_array, __typeof__(*events_array), eventCount); |
| vk_multialloc_add(&ma, &stage_masks, __typeof__(*stage_masks), eventCount); |
| |
| if (!vk_multialloc_alloc(&ma, |
| &cmd_buffer->vk.pool->alloc, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) { |
| vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY); |
| return; |
| } |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) { |
| vk_free(&cmd_buffer->vk.pool->alloc, events_array); |
| return; |
| } |
| |
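| /* Record the events to wait on and, for each event, the union of the |
| * destination stages from its dependency info converted to PVR stages. |
| */ |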
| memcpy(events_array, pEvents, sizeof(*events_array) * eventCount); |
| |
| for (uint32_t i = 0; i < eventCount; i++) { |
| const VkDependencyInfo *info = &pDependencyInfos[i]; |
| VkPipelineStageFlags2 mask = 0; |
| |
| for (uint32_t j = 0; j < info->memoryBarrierCount; j++) |
| mask |= info->pMemoryBarriers[j].dstStageMask; |
| |
| for (uint32_t j = 0; j < info->bufferMemoryBarrierCount; j++) |
| mask |= info->pBufferMemoryBarriers[j].dstStageMask; |
| |
| for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++) |
| mask |= info->pImageMemoryBarriers[j].dstStageMask; |
| |
| stage_masks[i] = pvr_stage_mask_dst(mask); |
| } |
| |
| cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){ |
| .type = PVR_EVENT_TYPE_WAIT, |
| .wait = { |
| .count = eventCount, |
| .events = events_array, |
| .wait_at_stage_masks = stage_masks, |
| }, |
| }; |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| void pvr_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, |
| VkPipelineStageFlags2 stage, |
| VkQueryPool queryPool, |
| uint32_t query) |
| { |
| unreachable("Timestamp queries are not supported."); |
| } |
| |
| VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| VkResult result; |
| |
| if (vk_command_buffer_has_error(&cmd_buffer->vk)) |
| return vk_command_buffer_end(&cmd_buffer->vk); |
| |
| /* TODO: We should be freeing all the resources allocated for recording |
| * here. |
| */ |
| util_dynarray_fini(&state->query_indices); |
| |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result); |
| |
| return vk_command_buffer_end(&cmd_buffer->vk); |
| } |