| /* |
| * Copyright © 2022 Imagination Technologies Ltd. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #include <assert.h> |
| #include <limits.h> |
| #include <stdbool.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <string.h> |
| #include <vulkan/vulkan.h> |
| |
| #include "hwdef/rogue_hw_defs.h" |
| #include "hwdef/rogue_hw_utils.h" |
| #include "pvr_bo.h" |
| #include "pvr_csb.h" |
| #include "pvr_csb_enum_helpers.h" |
| #include "pvr_device_info.h" |
| #include "pvr_end_of_tile.h" |
| #include "pvr_formats.h" |
| #include "pvr_hw_pass.h" |
| #include "pvr_job_common.h" |
| #include "pvr_job_render.h" |
| #include "pvr_limits.h" |
| #include "pvr_pds.h" |
| #include "pvr_private.h" |
| #include "pvr_types.h" |
| #include "pvr_winsys.h" |
| #include "util/bitscan.h" |
| #include "util/compiler.h" |
| #include "util/list.h" |
| #include "util/macros.h" |
| #include "util/u_dynarray.h" |
| #include "util/u_pack_color.h" |
| #include "vk_alloc.h" |
| #include "vk_command_buffer.h" |
| #include "vk_command_pool.h" |
| #include "vk_format.h" |
| #include "vk_log.h" |
| #include "vk_object.h" |
| #include "vk_util.h" |
| |
| /* Structure used to pass data into pvr_compute_generate_control_stream() |
| * function. |
| */ |
| struct pvr_compute_kernel_info { |
| pvr_dev_addr_t indirect_buffer_addr; |
| bool global_offsets_present; |
| uint32_t usc_common_size; |
| uint32_t usc_unified_size; |
| uint32_t pds_temp_size; |
| uint32_t pds_data_size; |
| enum PVRX(CDMCTRL_USC_TARGET) usc_target; |
| bool is_fence; |
| uint32_t pds_data_offset; |
| uint32_t pds_code_offset; |
| enum PVRX(CDMCTRL_SD_TYPE) sd_type; |
| bool usc_common_shared; |
| uint32_t local_size[PVR_WORKGROUP_DIMENSIONS]; |
| uint32_t global_size[PVR_WORKGROUP_DIMENSIONS]; |
| uint32_t max_instances; |
| }; |
| |
| static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd *sub_cmd) |
| { |
| if (sub_cmd->owned) { |
| switch (sub_cmd->type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: |
| pvr_csb_finish(&sub_cmd->gfx.control_stream); |
| pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.depth_bias_bo); |
| pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.scissor_bo); |
| break; |
| |
| case PVR_SUB_CMD_TYPE_COMPUTE: |
| pvr_csb_finish(&sub_cmd->compute.control_stream); |
| break; |
| |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
| list_for_each_entry_safe (struct pvr_transfer_cmd, |
| transfer_cmd, |
| &sub_cmd->transfer.transfer_cmds, |
| link) { |
| list_del(&transfer_cmd->link); |
| vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd); |
| } |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| if (sub_cmd->event.type == PVR_EVENT_TYPE_WAIT) |
| vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd->event.wait.events); |
| break; |
| |
| default: |
| pvr_finishme("Unsupported sub-command type %d", sub_cmd->type); |
| break; |
| } |
| } |
| |
| list_del(&sub_cmd->link); |
| vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd); |
| } |
| |
| static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| list_for_each_entry_safe (struct pvr_sub_cmd, |
| sub_cmd, |
| &cmd_buffer->sub_cmds, |
| link) { |
| pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd); |
| } |
| } |
| |
| static void pvr_cmd_buffer_free_resources(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| vk_free(&cmd_buffer->vk.pool->alloc, |
| cmd_buffer->state.render_pass_info.attachments); |
| vk_free(&cmd_buffer->vk.pool->alloc, |
| cmd_buffer->state.render_pass_info.clear_values); |
| |
| pvr_cmd_buffer_free_sub_cmds(cmd_buffer); |
| |
| list_for_each_entry_safe (struct pvr_bo, bo, &cmd_buffer->bo_list, link) { |
| list_del(&bo->link); |
| pvr_bo_free(cmd_buffer->device, bo); |
| } |
| |
| util_dynarray_fini(&cmd_buffer->deferred_csb_commands); |
| util_dynarray_fini(&cmd_buffer->scissor_array); |
| util_dynarray_fini(&cmd_buffer->depth_bias_array); |
| } |
| |
| static void pvr_cmd_buffer_reset(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| if (cmd_buffer->status != PVR_CMD_BUFFER_STATUS_INITIAL) { |
| /* FIXME: For now we always free all resources as if |
| * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set. |
| */ |
| pvr_cmd_buffer_free_resources(cmd_buffer); |
| |
| vk_command_buffer_reset(&cmd_buffer->vk); |
| |
| memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); |
| memset(cmd_buffer->scissor_words, 0, sizeof(cmd_buffer->scissor_words)); |
| |
| cmd_buffer->usage_flags = 0; |
| cmd_buffer->state.status = VK_SUCCESS; |
| cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL; |
| } |
| } |
| |
| static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) |
| { |
| struct pvr_cmd_buffer *cmd_buffer = |
| container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk); |
| |
| pvr_cmd_buffer_free_resources(cmd_buffer); |
| vk_command_buffer_finish(&cmd_buffer->vk); |
| vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer); |
| } |
| |
| static const struct vk_command_buffer_ops cmd_buffer_ops = { |
| .destroy = pvr_cmd_buffer_destroy, |
| }; |
| |
| static VkResult pvr_cmd_buffer_create(struct pvr_device *device, |
| struct vk_command_pool *pool, |
| VkCommandBufferLevel level, |
| VkCommandBuffer *pCommandBuffer) |
| { |
| struct pvr_cmd_buffer *cmd_buffer; |
| VkResult result; |
| |
| cmd_buffer = vk_zalloc(&pool->alloc, |
| sizeof(*cmd_buffer), |
| 8U, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!cmd_buffer) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| result = |
| vk_command_buffer_init(pool, &cmd_buffer->vk, &cmd_buffer_ops, level); |
| if (result != VK_SUCCESS) { |
| vk_free(&pool->alloc, cmd_buffer); |
| return result; |
| } |
| |
| cmd_buffer->device = device; |
| |
| util_dynarray_init(&cmd_buffer->depth_bias_array, NULL); |
| util_dynarray_init(&cmd_buffer->scissor_array, NULL); |
| util_dynarray_init(&cmd_buffer->deferred_csb_commands, NULL); |
| |
| cmd_buffer->state.status = VK_SUCCESS; |
| cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL; |
| |
| list_inithead(&cmd_buffer->sub_cmds); |
| list_inithead(&cmd_buffer->bo_list); |
| |
| *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer); |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult |
| pvr_AllocateCommandBuffers(VkDevice _device, |
| const VkCommandBufferAllocateInfo *pAllocateInfo, |
| VkCommandBuffer *pCommandBuffers) |
| { |
| VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool); |
| PVR_FROM_HANDLE(pvr_device, device, _device); |
| VkResult result = VK_SUCCESS; |
| uint32_t i; |
| |
| for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { |
| result = pvr_cmd_buffer_create(device, |
| pool, |
| pAllocateInfo->level, |
| &pCommandBuffers[i]); |
| if (result != VK_SUCCESS) |
| break; |
| } |
| |
| if (result != VK_SUCCESS) { |
| while (i--) { |
| VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]); |
| pvr_cmd_buffer_destroy(cmd_buffer); |
| } |
| |
| for (i = 0; i < pAllocateInfo->commandBufferCount; i++) |
| pCommandBuffers[i] = VK_NULL_HANDLE; |
| } |
| |
| return result; |
| } |
| |
| static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer, |
| enum pvr_sub_cmd_type type) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| uint32_t barriers; |
| |
| switch (type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: |
| barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_COMPUTE: |
| barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
| barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| barriers = 0; |
| break; |
| |
| default: |
| barriers = 0; |
| pvr_finishme("Unsupported sub-command type %d", type); |
| break; |
| } |
| |
| for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++) |
| state->barriers_needed[i] |= barriers; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_upload_tables(struct pvr_device *device, |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| const uint32_t cache_line_size = |
| rogue_get_slc_cache_line_size(&device->pdevice->dev_info); |
| VkResult result; |
| |
| assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo); |
| |
| if (cmd_buffer->depth_bias_array.size > 0) { |
| result = |
| pvr_gpu_upload(device, |
| device->heaps.general_heap, |
| util_dynarray_begin(&cmd_buffer->depth_bias_array), |
| cmd_buffer->depth_bias_array.size, |
| cache_line_size, |
| &sub_cmd->depth_bias_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| |
| if (cmd_buffer->scissor_array.size > 0) { |
| result = pvr_gpu_upload(device, |
| device->heaps.general_heap, |
| util_dynarray_begin(&cmd_buffer->scissor_array), |
| cmd_buffer->scissor_array.size, |
| cache_line_size, |
| &sub_cmd->scissor_bo); |
| if (result != VK_SUCCESS) |
| goto err_free_depth_bias_bo; |
| } |
| |
| util_dynarray_clear(&cmd_buffer->depth_bias_array); |
| util_dynarray_clear(&cmd_buffer->scissor_array); |
| |
| return VK_SUCCESS; |
| |
| err_free_depth_bias_bo: |
| pvr_bo_free(device, sub_cmd->depth_bias_bo); |
| sub_cmd->depth_bias_bo = NULL; |
| |
| return result; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_emit_ppp_state(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| struct pvr_framebuffer *framebuffer = |
| cmd_buffer->state.render_pass_info.framebuffer; |
| |
| pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE0, state0) { |
| state0.addrmsb = framebuffer->ppp_state_bo->vma->dev_addr; |
| state0.word_count = framebuffer->ppp_state_size; |
| } |
| |
| pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE1, state1) { |
| state1.addrlsb = framebuffer->ppp_state_bo->vma->dev_addr; |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer, |
| const void *const data, |
| const size_t size, |
| struct pvr_bo **const pvr_bo_out) |
| { |
| struct pvr_device *const device = cmd_buffer->device; |
| const uint32_t cache_line_size = |
| rogue_get_slc_cache_line_size(&device->pdevice->dev_info); |
| struct pvr_bo *pvr_bo; |
| VkResult result; |
| |
| result = pvr_gpu_upload(device, |
| device->heaps.general_heap, |
| data, |
| size, |
| cache_line_size, |
| &pvr_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| list_add(&pvr_bo->link, &cmd_buffer->bo_list); |
| |
| *pvr_bo_out = pvr_bo; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer, |
| const void *const code, |
| const size_t code_size, |
| uint64_t code_alignment, |
| struct pvr_bo **const pvr_bo_out) |
| { |
| struct pvr_device *const device = cmd_buffer->device; |
| const uint32_t cache_line_size = |
| rogue_get_slc_cache_line_size(&device->pdevice->dev_info); |
| struct pvr_bo *pvr_bo; |
| VkResult result; |
| |
| code_alignment = MAX2(code_alignment, cache_line_size); |
| |
| result = |
| pvr_gpu_upload_usc(device, code, code_size, code_alignment, &pvr_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| list_add(&pvr_bo->link, &cmd_buffer->bo_list); |
| |
| *pvr_bo_out = pvr_bo; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer, |
| const uint32_t *data, |
| uint32_t data_size_dwords, |
| uint32_t data_alignment, |
| const uint32_t *code, |
| uint32_t code_size_dwords, |
| uint32_t code_alignment, |
| uint64_t min_alignment, |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| struct pvr_device *const device = cmd_buffer->device; |
| VkResult result; |
| |
| result = pvr_gpu_upload_pds(device, |
| data, |
| data_size_dwords, |
| data_alignment, |
| code, |
| code_size_dwords, |
| code_alignment, |
| min_alignment, |
| pds_upload_out); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list); |
| |
| return VK_SUCCESS; |
| } |
| |
| static inline VkResult |
| pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer, |
| const uint32_t *data, |
| uint32_t data_size_dwords, |
| uint32_t data_alignment, |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| return pvr_cmd_buffer_upload_pds(cmd_buffer, |
| data, |
| data_size_dwords, |
| data_alignment, |
| NULL, |
| 0, |
| 0, |
| data_alignment, |
| pds_upload_out); |
| } |
| |
| static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload( |
| struct pvr_cmd_buffer *const cmd_buffer, |
| const uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS], |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| struct pvr_pds_event_program pixel_event_program = { |
| /* No data to DMA, just a DOUTU needed. */ |
| .num_emit_word_pairs = 0, |
| }; |
| const uint32_t staging_buffer_size = |
| cmd_buffer->device->pixel_event_data_size_in_dwords * sizeof(uint32_t); |
| const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc; |
| struct pvr_device *const device = cmd_buffer->device; |
| /* FIXME: This should come from the compiler for the USC pixel program. */ |
| const uint32_t usc_temp_count = 0; |
| struct pvr_bo *usc_eot_program; |
| uint8_t *usc_eot_program_ptr; |
| uint32_t *staging_buffer; |
| VkResult result; |
| |
| result = pvr_cmd_buffer_upload_usc(cmd_buffer, |
| pvr_end_of_tile_program, |
| sizeof(pvr_end_of_tile_program), |
| 4, |
| &usc_eot_program); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| assert((pbe_cs_words[1] & 0x3F) == 0x20); |
| |
| /* FIXME: Stop patching the framebuffer address (this will require the |
| * end-of-tile program to be generated at run-time). |
| */ |
| pvr_bo_cpu_map(device, usc_eot_program); |
| usc_eot_program_ptr = usc_eot_program->bo->map; |
| usc_eot_program_ptr[6] = (pbe_cs_words[0] >> 0) & 0xFF; |
| usc_eot_program_ptr[7] = (pbe_cs_words[0] >> 8) & 0xFF; |
| usc_eot_program_ptr[8] = (pbe_cs_words[0] >> 16) & 0xFF; |
| usc_eot_program_ptr[9] = (pbe_cs_words[0] >> 24) & 0xFF; |
| pvr_bo_cpu_unmap(device, usc_eot_program); |
| |
| pvr_pds_setup_doutu(&pixel_event_program.task_control, |
| usc_eot_program->vma->dev_addr.addr, |
| usc_temp_count, |
| PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE), |
| false); |
| |
| /* TODO: We could skip allocating this and generate directly into the device |
| * buffer thus removing one allocation and memcpy() per job. Would this |
| * speed up things in a noticeable way? |
| */ |
| staging_buffer = vk_alloc(allocator, |
| staging_buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!staging_buffer) { |
| result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| goto err_free_usc_pixel_program; |
| } |
| |
| /* Generate the data segment. The code segment was uploaded earlier when |
| * setting up the PDS static heap data. |
| */ |
| pvr_pds_generate_pixel_event_data_segment(&pixel_event_program, |
| staging_buffer, |
| &device->pdevice->dev_info); |
| |
| result = pvr_cmd_buffer_upload_pds_data( |
| cmd_buffer, |
| staging_buffer, |
| cmd_buffer->device->pixel_event_data_size_in_dwords, |
| 4, |
| pds_upload_out); |
| if (result != VK_SUCCESS) |
| goto err_free_pixel_event_staging_buffer; |
| |
| vk_free(allocator, staging_buffer); |
| |
| return VK_SUCCESS; |
| |
| err_free_pixel_event_staging_buffer: |
| vk_free(allocator, staging_buffer); |
| |
| err_free_usc_pixel_program: |
| list_del(&usc_eot_program->link); |
| pvr_bo_free(device, usc_eot_program); |
| |
| return result; |
| } |
| |
| static VkResult |
| pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_load_op *load_op, |
| pvr_dev_addr_t *const addr_out) |
| { |
| const struct pvr_render_pass_info *render_pass_info = |
| &cmd_buffer->state.render_pass_info; |
| const struct pvr_render_pass *pass = render_pass_info->pass; |
| const struct pvr_renderpass_hwsetup_render *hw_render = load_op->hw_render; |
| const struct pvr_renderpass_colorinit *color_init = |
| &hw_render->color_init[0]; |
| const struct pvr_render_pass_attachment *attachment = |
| &pass->attachments[color_init->index]; |
| const VkClearValue *clear_value = |
| &render_pass_info->clear_values[color_init->index]; |
| uint32_t hw_clear_value[PVR_CLEAR_COLOR_ARRAY_SIZE]; |
| struct pvr_bo *clear_bo; |
| VkResult result; |
| |
| pvr_finishme("Add missing load op data support"); |
| |
| assert(load_op->is_hw_object); |
| assert(hw_render->color_init_count == 1); |
| |
| /* FIXME: add support for VK_ATTACHMENT_LOAD_OP_LOAD. */ |
| assert(color_init->op == VK_ATTACHMENT_LOAD_OP_CLEAR); |
| |
| /* FIXME: do this at the point we store the clear values? */ |
| pvr_get_hw_clear_color(attachment->vk_format, |
| clear_value->color, |
| hw_clear_value); |
| |
| if (vk_format_get_blocksize(attachment->vk_format) > 4) |
| pvr_finishme("Handle clear color greater than 32 bits."); |
| |
| result = pvr_cmd_buffer_upload_general(cmd_buffer, |
| &hw_clear_value[0], |
| sizeof(hw_clear_value), |
| &clear_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| *addr_out = clear_bo->vma->dev_addr; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult pvr_load_op_pds_data_create_and_upload( |
| struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_load_op *load_op, |
| pvr_dev_addr_t constants_addr, |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| struct pvr_device *device = cmd_buffer->device; |
| const struct pvr_device_info *dev_info = &device->pdevice->dev_info; |
| struct pvr_pds_pixel_shader_sa_program program = { 0 }; |
| uint32_t staging_buffer_size; |
| uint32_t *staging_buffer; |
| VkResult result; |
| |
| program.num_texture_dma_kicks = 1; |
| |
| pvr_csb_pack (&program.texture_dma_address[0], |
| PDSINST_DOUT_FIELDS_DOUTD_SRC0, |
| value) { |
| value.sbase = constants_addr; |
| } |
| |
| pvr_csb_pack (&program.texture_dma_control[0], |
| PDSINST_DOUT_FIELDS_DOUTD_SRC1, |
| value) { |
| value.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE); |
| value.a0 = load_op->shareds_dest_offset; |
| value.bsize = load_op->shareds_count; |
| } |
| |
| pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info); |
| |
| staging_buffer_size = program.data_size * sizeof(*staging_buffer); |
| |
| staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc, |
| staging_buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!staging_buffer) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| pvr_pds_generate_pixel_shader_sa_texture_state_data(&program, |
| staging_buffer, |
| dev_info); |
| |
| result = pvr_cmd_buffer_upload_pds_data(cmd_buffer, |
| staging_buffer, |
| program.data_size, |
| 1, |
| pds_upload_out); |
| if (result != VK_SUCCESS) { |
| vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer); |
| return result; |
| } |
| |
| vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer); |
| |
| return VK_SUCCESS; |
| } |
| |
| /* FIXME: Should this function be specific to the HW background object, in |
| * which case its name should be changed, or should it have the load op |
| * structure passed in? |
| */ |
| static VkResult |
| pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_load_op *load_op, |
| struct pvr_pds_upload *const pds_upload_out) |
| { |
| pvr_dev_addr_t constants_addr; |
| VkResult result; |
| |
| result = pvr_load_op_constants_create_and_upload(cmd_buffer, |
| load_op, |
| &constants_addr); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| return pvr_load_op_pds_data_create_and_upload(cmd_buffer, |
| load_op, |
| constants_addr, |
| pds_upload_out); |
| } |
| |
| static void pvr_pds_bgnd_pack_state( |
| const struct pvr_load_op *load_op, |
| const struct pvr_pds_upload *load_op_program, |
| uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS]) |
| { |
| pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) { |
| value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset); |
| value.texunicode_addr = |
| PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset); |
| } |
| |
| pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) { |
| value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset); |
| } |
| |
| pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) { |
| value.usc_sharedsize = |
| DIV_ROUND_UP(load_op->const_shareds_count, |
| PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE)); |
| value.pds_texturestatesize = DIV_ROUND_UP( |
| load_op_program->data_size, |
| PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE)); |
| value.pds_tempsize = |
| DIV_ROUND_UP(load_op->temps_count, |
| PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE)); |
| } |
| } |
| |
| /** |
| * \brief Calculates the stride in pixels based on the pitch in bytes and pixel |
| * format. |
| * |
| * \param[in] pitch Width pitch in bytes. |
| * \param[in] vk_format Vulkan image format. |
| * \return Stride in pixels. |
| */ |
| static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format) |
| { |
| const unsigned int cpp = vk_format_get_blocksize(vk_format); |
| |
| assert(pitch % cpp == 0); |
| |
| return pitch / cpp; |
| } |
| |
| static void pvr_setup_pbe_state( |
| const struct pvr_device_info *dev_info, |
| struct pvr_framebuffer *framebuffer, |
| uint32_t mrt_index, |
| const struct usc_mrt_resource *mrt_resource, |
| const struct pvr_image_view *const iview, |
| const VkRect2D *render_area, |
| const bool down_scale, |
| const uint32_t samples, |
| uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS], |
| uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS]) |
| { |
| const struct pvr_image *image = vk_to_pvr_image(iview->vk.image); |
| uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch; |
| |
| struct pvr_pbe_surf_params surface_params; |
| struct pvr_pbe_render_params render_params; |
| bool with_packed_usc_channel; |
| const uint8_t *swizzle; |
| uint32_t position; |
| |
| /* down_scale should be true when performing a resolve, in which case there |
| * should be more than one sample. |
| */ |
| assert((down_scale && samples > 1U) || (!down_scale && samples == 1U)); |
| |
| /* Setup surface parameters. */ |
| |
| if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) { |
| switch (iview->vk.format) { |
| case VK_FORMAT_B8G8R8A8_UNORM: |
| with_packed_usc_channel = true; |
| break; |
| case VK_FORMAT_D32_SFLOAT: |
| with_packed_usc_channel = false; |
| break; |
| default: |
| unreachable("Unsupported Vulkan image format"); |
| } |
| } else { |
| with_packed_usc_channel = false; |
| } |
| |
| swizzle = pvr_get_format_swizzle(iview->vk.format); |
| memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle)); |
| |
| pvr_pbe_get_src_format_and_gamma(iview->vk.format, |
| PVR_PBE_GAMMA_NONE, |
| with_packed_usc_channel, |
| &surface_params.source_format, |
| &surface_params.gamma); |
| |
| surface_params.is_normalized = vk_format_is_normalized(iview->vk.format); |
| surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format); |
| surface_params.nr_components = vk_format_get_nr_components(iview->vk.format); |
| |
| /* FIXME: Should we have an inline function to return the address of a mip |
| * level? |
| */ |
| surface_params.addr = |
| PVR_DEV_ADDR_OFFSET(image->vma->dev_addr, |
| image->mip_levels[iview->vk.base_mip_level].offset); |
| |
| surface_params.mem_layout = image->memlayout; |
| surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format); |
| surface_params.depth = iview->vk.extent.depth; |
| surface_params.width = iview->vk.extent.width; |
| surface_params.height = iview->vk.extent.height; |
| surface_params.z_only_render = false; |
| surface_params.down_scale = down_scale; |
| surface_params.msaa_mode = samples; |
| |
| /* Setup render parameters. */ |
| |
| if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) { |
| position = mrt_resource->mem.offset_dw; |
| } else { |
| assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG); |
| assert(mrt_resource->reg.offset == 0); |
| |
| position = mrt_resource->reg.output_reg; |
| } |
| |
| assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers)); |
| |
| switch (position) { |
| case 0: |
| case 4: |
| render_params.source_start = PVR_PBE_STARTPOS_BIT0; |
| break; |
| case 1: |
| case 5: |
| render_params.source_start = PVR_PBE_STARTPOS_BIT32; |
| break; |
| case 2: |
| case 6: |
| render_params.source_start = PVR_PBE_STARTPOS_BIT64; |
| break; |
| case 3: |
| case 7: |
| render_params.source_start = PVR_PBE_STARTPOS_BIT96; |
| break; |
| default: |
| assert(!"Invalid output register"); |
| break; |
| } |
| |
| render_params.min_x_clip = MAX2(0, render_area->offset.x); |
| render_params.min_y_clip = MAX2(0, render_area->offset.y); |
| render_params.max_x_clip = |
| MIN2(framebuffer->width, |
| render_area->offset.x + render_area->extent.width) - |
| 1; |
| render_params.max_y_clip = |
| MIN2(framebuffer->height, |
| render_area->offset.y + render_area->extent.height) - |
| 1; |
| |
| render_params.slice = 0; |
| render_params.mrt_index = mrt_index; |
| |
| pvr_pbe_pack_state(dev_info, |
| &surface_params, |
| &render_params, |
| pbe_cs_words, |
| pbe_reg_words); |
| } |
| |
| static struct pvr_render_target * |
| pvr_get_render_target(const struct pvr_render_pass *pass, |
| const struct pvr_framebuffer *framebuffer, |
| uint32_t idx) |
| { |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &pass->hw_setup->renders[idx]; |
| uint32_t rt_idx = 0; |
| |
| switch (hw_render->sample_count) { |
| case 1: |
| case 2: |
| case 4: |
| case 8: |
| rt_idx = util_logbase2(hw_render->sample_count); |
| break; |
| |
| default: |
| unreachable("Unsupported sample count"); |
| break; |
| } |
| |
| return &framebuffer->render_targets[rt_idx]; |
| } |
| |
| static uint32_t |
| pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass, |
| uint32_t idx, |
| const struct pvr_device_info *dev_info) |
| { |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &pass->hw_setup->renders[idx]; |
| /* Default value based on the maximum value found in all existing cores. The |
| * maximum is used as this is being treated as a lower bound, making it a |
| * "safer" choice than the minimum value found in all existing cores. |
| */ |
| const uint32_t min_output_regs = |
| PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U); |
| const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs); |
| |
| return util_next_power_of_two(width); |
| } |
| |
| static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info, |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_gfx *sub_cmd) |
| { |
| struct pvr_render_pass_info *render_pass_info = |
| &cmd_buffer->state.render_pass_info; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx]; |
| struct pvr_render_job *job = &sub_cmd->job; |
| struct pvr_pds_upload pds_pixel_event_program; |
| uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS] |
| [ROGUE_NUM_PBESTATE_STATE_WORDS] = { 0 }; |
| struct pvr_render_target *render_target; |
| VkResult result; |
| |
| assert(hw_render->eot_surface_count < ARRAY_SIZE(pbe_cs_words)); |
| |
| for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) { |
| const struct pvr_renderpass_hwsetup_eot_surface *surface = |
| &hw_render->eot_surfaces[i]; |
| const struct pvr_image_view *iview = |
| render_pass_info->attachments[surface->attachment_idx]; |
| const struct usc_mrt_resource *mrt_resource = |
| &hw_render->eot_setup.mrt_resources[surface->mrt_idx]; |
| uint32_t samples = 1; |
| |
| if (surface->need_resolve) { |
| const struct pvr_image_view *resolve_src = |
| render_pass_info->attachments[surface->src_attachment_idx]; |
| |
| /* Attachments that are the destination of resolve operations must be |
| * loaded before their next use. |
| */ |
| render_pass_info->enable_bg_tag = true; |
| render_pass_info->process_empty_tiles = true; |
| |
| if (surface->resolve_type != PVR_RESOLVE_TYPE_PBE) |
| continue; |
| |
| samples = (uint32_t)resolve_src->vk.image->samples; |
| } |
| |
| pvr_setup_pbe_state(dev_info, |
| render_pass_info->framebuffer, |
| surface->mrt_idx, |
| mrt_resource, |
| iview, |
| &render_pass_info->render_area, |
| surface->need_resolve, |
| samples, |
| pbe_cs_words[i], |
| job->pbe_reg_words[i]); |
| } |
| |
| /* FIXME: The fragment program only supports a single surface at present. */ |
| assert(hw_render->eot_surface_count == 1); |
| result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload( |
| cmd_buffer, |
| pbe_cs_words[0], |
| &pds_pixel_event_program); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset; |
| |
| /* FIXME: Don't do this if there is a barrier load. */ |
| if (render_pass_info->enable_bg_tag) { |
| const struct pvr_load_op *load_op = hw_render->load_op; |
| struct pvr_pds_upload load_op_program; |
| |
| /* FIXME: Should we free the PDS pixel event data or let it be freed |
| * when the pool gets emptied? |
| */ |
| result = pvr_load_op_data_create_and_upload(cmd_buffer, |
| load_op, |
| &load_op_program); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pvr_pds_bgnd_pack_state(load_op, |
| &load_op_program, |
| job->pds_bgnd_reg_values); |
| } |
| |
| job->enable_bg_tag = render_pass_info->enable_bg_tag; |
| job->process_empty_tiles = render_pass_info->process_empty_tiles; |
| |
| render_target = pvr_get_render_target(render_pass_info->pass, |
| render_pass_info->framebuffer, |
| sub_cmd->hw_render_idx); |
| job->rt_dataset = render_target->rt_dataset; |
| |
| job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream); |
| |
| /* FIXME: Need to set up the border color table at device creation |
| * time. Set to invalid for the time being. |
| */ |
| job->border_colour_table_addr = PVR_DEV_ADDR_INVALID; |
| |
| if (sub_cmd->depth_bias_bo) |
| job->depth_bias_table_addr = sub_cmd->depth_bias_bo->vma->dev_addr; |
| else |
| job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID; |
| |
| if (sub_cmd->scissor_bo) |
| job->scissor_table_addr = sub_cmd->scissor_bo->vma->dev_addr; |
| else |
| job->scissor_table_addr = PVR_DEV_ADDR_INVALID; |
| |
| job->pixel_output_width = |
| pvr_pass_get_pixel_output_width(render_pass_info->pass, |
| sub_cmd->hw_render_idx, |
| dev_info); |
| |
| /* Setup depth/stencil job information. */ |
| if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) { |
| struct pvr_image_view *iview = |
| render_pass_info->attachments[hw_render->ds_attach_idx]; |
| const struct pvr_image *image = vk_to_pvr_image(iview->vk.image); |
| |
| if (vk_format_has_depth(image->vk.format)) { |
| uint32_t level_pitch = |
| image->mip_levels[iview->vk.base_mip_level].pitch; |
| |
| /* FIXME: Is this sufficient for depth buffers? */ |
| job->depth_addr = image->dev_addr; |
| |
| job->depth_stride = |
| pvr_stride_from_pitch(level_pitch, iview->vk.format); |
| job->depth_height = iview->vk.extent.height; |
| job->depth_physical_width = |
| u_minify(image->physical_extent.width, iview->vk.base_mip_level); |
| job->depth_physical_height = |
| u_minify(image->physical_extent.height, iview->vk.base_mip_level); |
| job->depth_layer_size = image->layer_size; |
| |
| if (hw_render->ds_attach_idx < render_pass_info->clear_value_count) { |
| VkClearValue *clear_values = |
| &render_pass_info->clear_values[hw_render->ds_attach_idx]; |
| |
| job->depth_clear_value = clear_values->depthStencil.depth; |
| } else { |
| job->depth_clear_value = 1.0f; |
| } |
| |
| job->depth_vk_format = iview->vk.format; |
| |
| job->depth_memlayout = image->memlayout; |
| } else { |
| job->depth_addr = PVR_DEV_ADDR_INVALID; |
| job->depth_stride = 0; |
| job->depth_height = 0; |
| job->depth_physical_width = 0; |
| job->depth_physical_height = 0; |
| job->depth_layer_size = 0; |
| job->depth_clear_value = 1.0f; |
| job->depth_vk_format = VK_FORMAT_UNDEFINED; |
| job->depth_memlayout = PVR_MEMLAYOUT_LINEAR; |
| } |
| |
| if (vk_format_has_stencil(image->vk.format)) { |
| /* FIXME: Is this sufficient for stencil buffers? */ |
| job->stencil_addr = image->dev_addr; |
| } else { |
| job->stencil_addr = PVR_DEV_ADDR_INVALID; |
| } |
| } else { |
| job->depth_addr = PVR_DEV_ADDR_INVALID; |
| job->depth_stride = 0; |
| job->depth_height = 0; |
| job->depth_physical_width = 0; |
| job->depth_physical_height = 0; |
| job->depth_layer_size = 0; |
| job->depth_clear_value = 1.0f; |
| job->depth_vk_format = VK_FORMAT_UNDEFINED; |
| job->depth_memlayout = PVR_MEMLAYOUT_LINEAR; |
| |
| job->stencil_addr = PVR_DEV_ADDR_INVALID; |
| } |
| |
| if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) { |
| struct pvr_image_view *iview = |
| render_pass_info->attachments[hw_render->ds_attach_idx]; |
| const struct pvr_image *image = vk_to_pvr_image(iview->vk.image); |
| |
| /* If the HW render pass has a valid depth/stencil surface, determine the |
| * sample count from the attachment's image. |
| */ |
| job->samples = image->vk.samples; |
| } else if (hw_render->output_regs_count) { |
| /* If the HW render pass has output registers, we have color attachments |
| * to write to, so determine the sample count from the count specified for |
| * every color attachment in this render. |
| */ |
| job->samples = hw_render->sample_count; |
| } else if (cmd_buffer->state.gfx_pipeline) { |
| /* If the HW render pass has no color or depth/stencil attachments, we |
| * determine the sample count from the count given during pipeline |
| * creation. |
| */ |
| job->samples = cmd_buffer->state.gfx_pipeline->rasterization_samples; |
| } else if (render_pass_info->pass->attachment_count > 0) { |
| /* If we get here, we have a render pass with subpasses containing no |
| * attachments. The next best thing is largest of the sample counts |
| * specified by the render pass attachment descriptions. |
| */ |
| job->samples = render_pass_info->pass->max_sample_count; |
| } else { |
| /* No appropriate framebuffer attachment is available. */ |
| mesa_logw("Defaulting render job sample count to 1."); |
| job->samples = VK_SAMPLE_COUNT_1_BIT; |
| } |
| |
| if (sub_cmd->max_tiles_in_flight == |
| PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) { |
| /* Use the default limit based on the partition store. */ |
| job->max_tiles_in_flight = 0U; |
| } else { |
| job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight; |
| } |
| |
| job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops; |
| job->disable_compute_overlap = false; |
| job->max_shared_registers = cmd_buffer->state.max_shared_regs; |
| job->run_frag = true; |
| job->geometry_terminate = true; |
| |
| return VK_SUCCESS; |
| } |
| |
| /* Number of shareds used in the Issue Data Fence(IDF)/Wait Data Fence(WDF) |
| * kernel. |
| */ |
| #define PVR_IDF_WDF_IN_REGISTER_CONST_COUNT 12U |
| |
| static void |
| pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice, |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *sub_cmd) |
| { |
| const struct pvr_device_runtime_info *dev_runtime_info = |
| &pdevice->dev_runtime_info; |
| const struct pvr_device_info *dev_info = &pdevice->dev_info; |
| |
| if (sub_cmd->uses_barrier) |
| sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_PREVENT_ALL_OVERLAP; |
| |
| pvr_csb_pack (&sub_cmd->submit_info.regs.cdm_ctrl_stream_base, |
| CR_CDM_CTRL_STREAM_BASE, |
| value) { |
| value.addr = pvr_csb_get_start_address(&sub_cmd->control_stream); |
| } |
| |
| /* FIXME: Need to set up the border color table at device creation |
| * time. Set to invalid for the time being. |
| */ |
| pvr_csb_pack (&sub_cmd->submit_info.regs.tpu_border_colour_table, |
| CR_TPU_BORDER_COLOUR_TABLE_CDM, |
| value) { |
| value.border_colour_table_address = PVR_DEV_ADDR_INVALID; |
| } |
| |
| sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds, |
| cmd_buffer->state.max_shared_regs); |
| |
| cmd_buffer->state.max_shared_regs = 0U; |
| |
| if (PVR_HAS_FEATURE(dev_info, compute_morton_capable)) |
| sub_cmd->submit_info.regs.cdm_item = 0; |
| |
| pvr_csb_pack (&sub_cmd->submit_info.regs.tpu, CR_TPU, value) { |
| value.tag_cem_4k_face_packing = true; |
| } |
| |
| if (PVR_HAS_FEATURE(dev_info, cluster_grouping) && |
| PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) && |
| dev_runtime_info->num_phantoms > 1 && sub_cmd->uses_atomic_ops) { |
| /* Each phantom has its own MCU, so atomicity can only be guaranteed |
| * when all work items are processed on the same phantom. This means we |
| * need to disable all USCs other than those of the first phantom, which |
| * has 4 clusters. |
| */ |
| pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster, |
| CR_COMPUTE_CLUSTER, |
| value) { |
| value.mask = 0xFU; |
| } |
| } else { |
| pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster, |
| CR_COMPUTE_CLUSTER, |
| value) { |
| value.mask = 0U; |
| } |
| } |
| |
| if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support) && |
| sub_cmd->uses_atomic_ops) { |
| sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_SINGLE_CORE; |
| } |
| } |
| |
| #define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \ |
| (1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)) |
| |
| static uint32_t |
| pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice, |
| uint32_t coeff_regs_count, |
| bool use_barrier, |
| uint32_t total_workitems) |
| { |
| const struct pvr_device_runtime_info *dev_runtime_info = |
| &pdevice->dev_runtime_info; |
| const struct pvr_device_info *dev_info = &pdevice->dev_info; |
| uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK; |
| uint32_t max_avail_coeff_regs = |
| dev_runtime_info->cdm_max_local_mem_size_regs; |
| uint32_t localstore_chunks_count = |
| DIV_ROUND_UP(coeff_regs_count << 2, |
| PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)); |
| |
| /* Ensure that we cannot have more workgroups in a slot than the available |
| * number of coefficients allow us to have. |
| */ |
| if (coeff_regs_count > 0U) { |
| /* If TA or 3D can overlap with CDM, or if the TA is running a geometry |
| * shader then we need to consider this in calculating max allowed |
| * work-groups. |
| */ |
| if (PVR_HAS_QUIRK(dev_info, 52354) && |
| (PVR_HAS_FEATURE(dev_info, compute_overlap) || |
| PVR_HAS_FEATURE(dev_info, gs_rta_support))) { |
| /* Solve for n (number of work-groups per task). All values are in |
| * size of common store alloc blocks: |
| * |
| * n + (2n + 7) * (local_memory_size_max - 1) = |
| * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max) |
| * ==> |
| * n + 2n * (local_memory_size_max - 1) = |
| * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max) |
| * - (7 * (local_memory_size_max - 1)) |
| * ==> |
| * n * (1 + 2 * (local_memory_size_max - 1)) = |
| * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max) |
| * - (7 * (local_memory_size_max - 1)) |
| * ==> |
| * n = ((coefficient_memory_pool_size) - |
| * (7 * pixel_allocation_size_max) - |
| * (7 * (local_memory_size_max - 1)) / (1 + |
| * 2 * (local_memory_size_max - 1))) |
| */ |
| uint32_t max_common_store_blocks = |
| DIV_ROUND_UP(max_avail_coeff_regs * 4U, |
| PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)); |
| |
| /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max) |
| */ |
| max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES * |
| PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS; |
| |
| /* - (7 * (local_memory_size_max - 1)) */ |
| max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES * |
| (localstore_chunks_count - 1U)); |
| |
| /* Divide by (1 + 2 * (local_memory_size_max - 1)) */ |
| max_workgroups_per_task = max_common_store_blocks / |
| (1U + 2U * (localstore_chunks_count - 1U)); |
| |
| max_workgroups_per_task = |
| MIN2(max_workgroups_per_task, |
| ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK); |
| |
| } else { |
| max_workgroups_per_task = |
| MIN2((max_avail_coeff_regs / coeff_regs_count), |
| max_workgroups_per_task); |
| } |
| } |
| |
| /* max_workgroups_per_task should at least be one. */ |
| assert(max_workgroups_per_task >= 1U); |
| |
| if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) { |
| /* In this case, the work group size will have been padded up to the |
| * next ROGUE_MAX_INSTANCES_PER_TASK so we just set max instances to be |
| * ROGUE_MAX_INSTANCES_PER_TASK. |
| */ |
| return ROGUE_MAX_INSTANCES_PER_TASK; |
| } |
| |
| /* In this case, the number of instances in the slot must be clamped to |
| * accommodate whole work-groups only. |
| */ |
| if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) { |
| max_workgroups_per_task = |
| MIN2(max_workgroups_per_task, |
| ROGUE_MAX_INSTANCES_PER_TASK / total_workitems); |
| return total_workitems * max_workgroups_per_task; |
| } |
| |
| return MIN2(total_workitems * max_workgroups_per_task, |
| ROGUE_MAX_INSTANCES_PER_TASK); |
| } |
| |
| static void |
| pvr_compute_generate_control_stream(struct pvr_csb *csb, |
| struct pvr_sub_cmd_compute *sub_cmd, |
| const struct pvr_compute_kernel_info *info) |
| { |
| /* Compute kernel 0. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) { |
| kernel0.indirect_present = !!info->indirect_buffer_addr.addr; |
| kernel0.global_offsets_present = info->global_offsets_present; |
| kernel0.usc_common_size = info->usc_common_size; |
| kernel0.usc_unified_size = info->usc_unified_size; |
| kernel0.pds_temp_size = info->pds_temp_size; |
| kernel0.pds_data_size = info->pds_data_size; |
| kernel0.usc_target = info->usc_target; |
| kernel0.fence = info->is_fence; |
| } |
| |
| /* Compute kernel 1. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) { |
| kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset); |
| kernel1.sd_type = info->sd_type; |
| kernel1.usc_common_shared = info->usc_common_shared; |
| } |
| |
| /* Compute kernel 2. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) { |
| kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset); |
| } |
| |
| if (info->indirect_buffer_addr.addr) { |
| /* Compute kernel 6. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) { |
| kernel6.indirect_addrmsb = info->indirect_buffer_addr; |
| } |
| |
| /* Compute kernel 7. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) { |
| kernel7.indirect_addrlsb = info->indirect_buffer_addr; |
| } |
| } else { |
| /* Compute kernel 3. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) { |
| assert(info->global_size[0U] > 0U); |
| kernel3.workgroup_x = info->global_size[0U] - 1U; |
| } |
| |
| /* Compute kernel 4. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) { |
| assert(info->global_size[1U] > 0U); |
| kernel4.workgroup_y = info->global_size[1U] - 1U; |
| } |
| |
| /* Compute kernel 5. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) { |
| assert(info->global_size[2U] > 0U); |
| kernel5.workgroup_z = info->global_size[2U] - 1U; |
| } |
| } |
| |
| /* Compute kernel 8. */ |
| pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) { |
| if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK) |
| kernel8.max_instances = 0U; |
| else |
| kernel8.max_instances = info->max_instances; |
| |
| assert(info->local_size[0U] > 0U); |
| kernel8.workgroup_size_x = info->local_size[0U] - 1U; |
| assert(info->local_size[1U] > 0U); |
| kernel8.workgroup_size_y = info->local_size[1U] - 1U; |
| assert(info->local_size[2U] > 0U); |
| kernel8.workgroup_size_z = info->local_size[2U] - 1U; |
| } |
| |
| /* Track the highest amount of shared registers usage in this dispatch. |
| * This is used by the FW for context switching, so must be large enough |
| * to contain all the shared registers that might be in use for this compute |
| * job. Coefficients don't need to be included as the context switch will not |
| * happen within the execution of a single workgroup, thus nothing needs to |
| * be preserved. |
| */ |
| if (info->usc_common_shared) { |
| sub_cmd->num_shared_regs = |
| MAX2(sub_cmd->num_shared_regs, info->usc_common_size); |
| } |
| } |
| |
| /* TODO: This can be pre-packed and uploaded directly. Would that provide any |
| * speed up? |
| */ |
| static void |
| pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| bool *const is_sw_barier_required = |
| &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing; |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| const struct pvr_pds_upload *program; |
| |
| if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) && |
| *is_sw_barier_required) { |
| *is_sw_barier_required = false; |
| program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds; |
| } else { |
| program = &cmd_buffer->device->idfwdf_state.pds; |
| } |
| |
| struct pvr_compute_kernel_info info = { |
| .indirect_buffer_addr = PVR_DEV_ADDR_INVALID, |
| .global_offsets_present = false, |
| .usc_common_size = |
| DIV_ROUND_UP(cmd_buffer->device->idfwdf_state.usc_shareds << 2, |
| PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)), |
| .usc_unified_size = 0U, |
| .pds_temp_size = 0U, |
| .pds_data_size = |
| DIV_ROUND_UP(program->data_size << 2, |
| PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)), |
| .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL), |
| .is_fence = false, |
| .pds_data_offset = program->data_offset, |
| .sd_type = PVRX(CDMCTRL_SD_TYPE_USC), |
| .usc_common_shared = true, |
| .pds_code_offset = program->code_offset, |
| .global_size = { 1U, 1U, 1U }, |
| .local_size = { 1U, 1U, 1U }, |
| }; |
| |
| /* We don't need to pad work-group size for this case. */ |
| |
| info.max_instances = |
| pvr_compute_flat_slot_size(pdevice, |
| cmd_buffer->device->idfwdf_state.usc_shareds, |
| false, |
| 1U); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
| |
| static void |
| pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd, |
| bool deallocate_shareds) |
| { |
| const struct pvr_pds_upload *program = |
| &cmd_buffer->device->pds_compute_fence_program; |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| |
| struct pvr_compute_kernel_info info = { |
| .indirect_buffer_addr = PVR_DEV_ADDR_INVALID, |
| .global_offsets_present = false, |
| .usc_common_size = 0U, |
| .usc_unified_size = 0U, |
| .pds_temp_size = 0U, |
| .pds_data_size = |
| DIV_ROUND_UP(program->data_size << 2, |
| PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)), |
| .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY), |
| .is_fence = true, |
| .pds_data_offset = program->data_offset, |
| .sd_type = PVRX(CDMCTRL_SD_TYPE_PDS), |
| .usc_common_shared = deallocate_shareds, |
| .pds_code_offset = program->code_offset, |
| .global_size = { 1U, 1U, 1U }, |
| .local_size = { 1U, 1U, 1U }, |
| }; |
| |
| /* We don't need to pad work-group size for this case. */ |
| /* Here we calculate the slot size. This can depend on the use of barriers, |
| * local memory, BRN's or other factors. |
| */ |
| info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
| |
| static VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd; |
| struct pvr_device *device = cmd_buffer->device; |
| VkResult result; |
| |
| /* FIXME: Is this NULL check required because this function is called from |
| * pvr_resolve_unemitted_resolve_attachments()? See comment about this |
| * function being called twice in a row in pvr_CmdEndRenderPass(). |
| */ |
| if (!sub_cmd) |
| return VK_SUCCESS; |
| |
| if (!sub_cmd->owned) { |
| state->current_sub_cmd = NULL; |
| return VK_SUCCESS; |
| } |
| |
| switch (sub_cmd->type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: { |
| struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx; |
| |
| if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { |
| result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream); |
| if (result != VK_SUCCESS) { |
| state->status = result; |
| return result; |
| } |
| |
| break; |
| } |
| |
| /* TODO: Check if the sub_cmd can be skipped based on |
| * sub_cmd->gfx.empty_cmd flag. |
| */ |
| |
| result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd); |
| if (result != VK_SUCCESS) { |
| state->status = result; |
| return result; |
| } |
| |
| result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, gfx_sub_cmd); |
| if (result != VK_SUCCESS) { |
| state->status = result; |
| return result; |
| } |
| |
| result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream); |
| if (result != VK_SUCCESS) { |
| state->status = result; |
| return result; |
| } |
| |
| result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info, |
| cmd_buffer, |
| gfx_sub_cmd); |
| if (result != VK_SUCCESS) { |
| state->status = result; |
| return result; |
| } |
| |
| break; |
| } |
| |
| case PVR_SUB_CMD_TYPE_COMPUTE: { |
| struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute; |
| |
| pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true); |
| |
| result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream); |
| if (result != VK_SUCCESS) { |
| state->status = result; |
| return result; |
| } |
| |
| pvr_sub_cmd_compute_job_init(device->pdevice, |
| cmd_buffer, |
| compute_sub_cmd); |
| break; |
| } |
| |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| break; |
| |
| default: |
| pvr_finishme("Unsupported sub-command type %d", sub_cmd->type); |
| break; |
| } |
| |
| state->current_sub_cmd = NULL; |
| |
| return VK_SUCCESS; |
| } |
| |
| static void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer_state *state, |
| bool start_geom) |
| { |
| if (start_geom) { |
| /* |
| * Initial geometry phase state. |
| * It's the driver's responsibility to ensure that the state of the |
| * hardware is correctly initialized at the start of every geometry |
| * phase. This is required to prevent stale state from a previous |
| * geometry phase erroneously affecting the next geometry phase. |
| * |
| * If a geometry phase does not contain any geometry, this restriction |
| * can be ignored. If the first draw call in a geometry phase will only |
| * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set |
| * in the ISP State Control Word, the PDS State Pointers |
| * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to |
| * be supplied, since they will never reach the PDS in the fragment |
| * phase. |
| */ |
| |
| state->emit_header = (struct PVRX(TA_STATE_HEADER)){ |
| .pres_stream_out_size = true, |
| .pres_ppp_ctrl = true, |
| .pres_varying_word2 = true, |
| .pres_varying_word1 = true, |
| .pres_varying_word0 = true, |
| .pres_outselects = true, |
| .pres_wclamp = true, |
| .pres_viewport = true, |
| .pres_region_clip = true, |
| .pres_pds_state_ptr0 = true, |
| .pres_ispctl_fb = true, |
| .pres_ispctl = true, |
| }; |
| } else { |
| state->emit_header.pres_ppp_ctrl = true; |
| state->emit_header.pres_varying_word1 = true; |
| state->emit_header.pres_varying_word0 = true; |
| state->emit_header.pres_outselects = true; |
| state->emit_header.pres_viewport = true; |
| state->emit_header.pres_region_clip = true; |
| state->emit_header.pres_pds_state_ptr0 = true; |
| state->emit_header.pres_ispctl_fb = true; |
| state->emit_header.pres_ispctl = true; |
| } |
| |
| memset(&state->ppp_state, 0U, sizeof(state->ppp_state)); |
| |
| state->dirty.vertex_bindings = true; |
| state->dirty.gfx_pipeline_binding = true; |
| state->dirty.viewport = true; |
| } |
| |
| static inline bool |
| pvr_cmd_uses_deferred_cs_cmds(const struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| const VkCommandBufferUsageFlags deferred_control_stream_flags = |
| VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT & |
| VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; |
| |
| return cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && |
| (cmd_buffer->usage_flags & deferred_control_stream_flags) == |
| deferred_control_stream_flags; |
| } |
| |
| static VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer, |
| enum pvr_sub_cmd_type type) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_device *device = cmd_buffer->device; |
| struct pvr_sub_cmd *sub_cmd; |
| VkResult result; |
| |
| /* Check the current status of the buffer. */ |
| if (state->status != VK_SUCCESS) |
| return state->status; |
| |
| pvr_cmd_buffer_update_barriers(cmd_buffer, type); |
| |
| /* TODO: Add proper support for joining consecutive event sub_cmd? */ |
| if (state->current_sub_cmd) { |
| if (state->current_sub_cmd->type == type) { |
| /* Continue adding to the current sub command. */ |
| return VK_SUCCESS; |
| } |
| |
| /* End the current sub command. */ |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| |
| sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc, |
| sizeof(*sub_cmd), |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!sub_cmd) { |
| state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); |
| return state->status; |
| } |
| |
| sub_cmd->type = type; |
| sub_cmd->owned = true; |
| |
| switch (type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: |
| |
| sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED; |
| sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED; |
| sub_cmd->gfx.modifies_depth = false; |
| sub_cmd->gfx.modifies_stencil = false; |
| sub_cmd->gfx.max_tiles_in_flight = |
| PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info, |
| isp_max_tiles_in_flight, |
| 1); |
| sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass; |
| sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer; |
| sub_cmd->gfx.empty_cmd = true; |
| |
| pvr_reset_graphics_dirty_state(state, true); |
| if (pvr_cmd_uses_deferred_cs_cmds(cmd_buffer)) { |
| pvr_csb_init(device, |
| PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED, |
| &sub_cmd->gfx.control_stream); |
| } else { |
| pvr_csb_init(device, |
| PVR_CMD_STREAM_TYPE_GRAPHICS, |
| &sub_cmd->gfx.control_stream); |
| } |
| break; |
| |
| case PVR_SUB_CMD_TYPE_COMPUTE: |
| pvr_csb_init(device, |
| PVR_CMD_STREAM_TYPE_COMPUTE, |
| &sub_cmd->compute.control_stream); |
| break; |
| |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
| list_inithead(&sub_cmd->transfer.transfer_cmds); |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| break; |
| |
| default: |
| pvr_finishme("Unsupported sub-command type %d", type); |
| break; |
| } |
| |
| list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds); |
| state->current_sub_cmd = sub_cmd; |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_winsys_heap *heap, |
| uint64_t size, |
| uint32_t flags, |
| struct pvr_bo **const pvr_bo_out) |
| { |
| const uint32_t cache_line_size = |
| rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info); |
| struct pvr_bo *pvr_bo; |
| VkResult result; |
| |
| result = pvr_bo_alloc(cmd_buffer->device, |
| heap, |
| size, |
| cache_line_size, |
| flags, |
| &pvr_bo); |
| if (result != VK_SUCCESS) { |
| cmd_buffer->state.status = result; |
| return result; |
| } |
| |
| list_add(&pvr_bo->link, &cmd_buffer->bo_list); |
| |
| *pvr_bo_out = pvr_bo; |
| |
| return VK_SUCCESS; |
| } |
| |
| static void pvr_cmd_bind_compute_pipeline( |
| const struct pvr_compute_pipeline *const compute_pipeline, |
| struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| cmd_buffer->state.compute_pipeline = compute_pipeline; |
| cmd_buffer->state.dirty.compute_pipeline_binding = true; |
| } |
| |
| static void pvr_cmd_bind_graphics_pipeline( |
| const struct pvr_graphics_pipeline *const gfx_pipeline, |
| struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| struct pvr_dynamic_state *const dest_state = |
| &cmd_buffer->state.dynamic.common; |
| const struct pvr_dynamic_state *const src_state = |
| &gfx_pipeline->dynamic_state; |
| struct pvr_cmd_buffer_state *const cmd_buffer_state = &cmd_buffer->state; |
| const uint32_t state_mask = src_state->mask; |
| |
| cmd_buffer_state->gfx_pipeline = gfx_pipeline; |
| cmd_buffer_state->dirty.gfx_pipeline_binding = true; |
| |
| /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_VIEWPORT. */ |
| if (!(state_mask & PVR_DYNAMIC_STATE_BIT_VIEWPORT)) { |
| assert(!"Unimplemented"); |
| } |
| |
| /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_SCISSOR. */ |
| if (!(state_mask & PVR_DYNAMIC_STATE_BIT_SCISSOR)) { |
| assert(!"Unimplemented"); |
| } |
| |
| if (!(state_mask & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH)) { |
| dest_state->line_width = src_state->line_width; |
| |
| cmd_buffer_state->dirty.line_width = true; |
| } |
| |
| if (!(state_mask & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) { |
| memcpy(&dest_state->depth_bias, |
| &src_state->depth_bias, |
| sizeof(src_state->depth_bias)); |
| |
| cmd_buffer_state->dirty.depth_bias = true; |
| } |
| |
| if (!(state_mask & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) { |
| STATIC_ASSERT( |
| __same_type(dest_state->blend_constants, src_state->blend_constants)); |
| |
| typed_memcpy(dest_state->blend_constants, |
| src_state->blend_constants, |
| ARRAY_SIZE(dest_state->blend_constants)); |
| |
| cmd_buffer_state->dirty.blend_constants = true; |
| } |
| |
| if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) { |
| dest_state->compare_mask.front = src_state->compare_mask.front; |
| dest_state->compare_mask.back = src_state->compare_mask.back; |
| |
| cmd_buffer_state->dirty.compare_mask = true; |
| } |
| |
| if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) { |
| dest_state->write_mask.front = src_state->write_mask.front; |
| dest_state->write_mask.back = src_state->write_mask.back; |
| |
| cmd_buffer_state->dirty.write_mask = true; |
| } |
| |
| if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) { |
| dest_state->reference.front = src_state->reference.front; |
| dest_state->reference.back = src_state->reference.back; |
| |
| cmd_buffer_state->dirty.reference = true; |
| } |
| } |
| |
| void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer, |
| VkPipelineBindPoint pipelineBindPoint, |
| VkPipeline _pipeline) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline); |
| |
| switch (pipelineBindPoint) { |
| case VK_PIPELINE_BIND_POINT_COMPUTE: |
| pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline), |
| cmd_buffer); |
| break; |
| |
| case VK_PIPELINE_BIND_POINT_GRAPHICS: |
| pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline), |
| cmd_buffer); |
| break; |
| |
| default: |
| unreachable("Invalid bind point."); |
| break; |
| } |
| } |
| |
| #if defined(DEBUG) |
| static void check_viewport_quirk_70165(const struct pvr_device *device, |
| const VkViewport *pViewport) |
| { |
| const struct pvr_device_info *dev_info = &device->pdevice->dev_info; |
| float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y; |
| float min_screen_space_value, max_screen_space_value; |
| float sign_to_unsigned_offset, fixed_point_max; |
| float guardband_width, guardband_height; |
| |
| if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { |
| /* Max representable value in 13.4 fixed point format. |
| * Round-down to avoid precision issues. |
| * Calculated as (2 ** 13) - 2*(2 ** -4) |
| */ |
| fixed_point_max = 8192.0f - 2.0f / 16.0f; |
| |
| if (PVR_HAS_FEATURE(dev_info, screen_size8K)) { |
| if (pViewport->width <= 4096 && pViewport->height <= 4096) { |
| guardband_width = pViewport->width / 4.0f; |
| guardband_height = pViewport->height / 4.0f; |
| |
| /* 2k of the range is negative */ |
| sign_to_unsigned_offset = 2048.0f; |
| } else { |
| guardband_width = 0.0f; |
| guardband_height = 0.0f; |
| |
| /* For > 4k renders, the entire range is positive */ |
| sign_to_unsigned_offset = 0.0f; |
| } |
| } else { |
| guardband_width = pViewport->width / 4.0f; |
| guardband_height = pViewport->height / 4.0f; |
| |
| /* 2k of the range is negative */ |
| sign_to_unsigned_offset = 2048.0f; |
| } |
| } else { |
| /* Max representable value in 16.8 fixed point format |
| * Calculated as (2 ** 16) - (2 ** -8) |
| */ |
| fixed_point_max = 65535.99609375f; |
| guardband_width = pViewport->width / 4.0f; |
| guardband_height = pViewport->height / 4.0f; |
| |
| /* 4k/20k of the range is negative */ |
| sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET; |
| } |
| |
| min_screen_space_value = -sign_to_unsigned_offset; |
| max_screen_space_value = fixed_point_max - sign_to_unsigned_offset; |
| |
| min_vertex_x = pViewport->x - guardband_width; |
| max_vertex_x = pViewport->x + pViewport->width + guardband_width; |
| min_vertex_y = pViewport->y - guardband_height; |
| max_vertex_y = pViewport->y + pViewport->height + guardband_height; |
| if (min_vertex_x < min_screen_space_value || |
| max_vertex_x > max_screen_space_value || |
| min_vertex_y < min_screen_space_value || |
| max_vertex_y > max_screen_space_value) { |
| mesa_logw("Viewport is affected by BRN70165, geometry outside " |
| "the viewport could be corrupted"); |
| } |
| } |
| #endif |
| |
| void pvr_CmdSetViewport(VkCommandBuffer commandBuffer, |
| uint32_t firstViewport, |
| uint32_t viewportCount, |
| const VkViewport *pViewports) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| const uint32_t total_count = firstViewport + viewportCount; |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0); |
| assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| #if defined(DEBUG) |
| if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) { |
| for (uint32_t viewport = 0; viewport < viewportCount; viewport++) { |
| check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]); |
| } |
| } |
| #endif |
| |
| if (state->dynamic.common.viewport.count < total_count) |
| state->dynamic.common.viewport.count = total_count; |
| |
| memcpy(&state->dynamic.common.viewport.viewports[firstViewport], |
| pViewports, |
| viewportCount * sizeof(*pViewports)); |
| |
| state->dirty.viewport = true; |
| } |
| |
| void pvr_CmdSetScissor(VkCommandBuffer commandBuffer, |
| uint32_t firstScissor, |
| uint32_t scissorCount, |
| const VkRect2D *pScissors) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| const uint32_t total_count = firstScissor + scissorCount; |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| assert(firstScissor < PVR_MAX_VIEWPORTS && scissorCount > 0); |
| assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| if (state->dynamic.common.scissor.count < total_count) |
| state->dynamic.common.scissor.count = total_count; |
| |
| memcpy(&state->dynamic.common.scissor.scissors[firstScissor], |
| pScissors, |
| scissorCount * sizeof(*pScissors)); |
| |
| state->dirty.scissor = true; |
| } |
| |
| void pvr_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| state->dynamic.common.line_width = lineWidth; |
| state->dirty.line_width = true; |
| } |
| |
| void pvr_CmdSetDepthBias(VkCommandBuffer commandBuffer, |
| float depthBiasConstantFactor, |
| float depthBiasClamp, |
| float depthBiasSlopeFactor) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| state->dynamic.common.depth_bias.constant_factor = depthBiasConstantFactor; |
| state->dynamic.common.depth_bias.slope_factor = depthBiasSlopeFactor; |
| state->dynamic.common.depth_bias.clamp = depthBiasClamp; |
| state->dirty.depth_bias = true; |
| } |
| |
| void pvr_CmdSetBlendConstants(VkCommandBuffer commandBuffer, |
| const float blendConstants[4]) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| STATIC_ASSERT(ARRAY_SIZE(state->dynamic.common.blend_constants) == 4); |
| memcpy(state->dynamic.common.blend_constants, |
| blendConstants, |
| sizeof(state->dynamic.common.blend_constants)); |
| |
| state->dirty.blend_constants = true; |
| } |
| |
| void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer, |
| float minDepthBounds, |
| float maxDepthBounds) |
| { |
| mesa_logd("No support for depth bounds testing."); |
| } |
| |
| void pvr_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, |
| VkStencilFaceFlags faceMask, |
| uint32_t compareMask) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| if (faceMask & VK_STENCIL_FACE_FRONT_BIT) |
| state->dynamic.common.compare_mask.front = compareMask; |
| |
| if (faceMask & VK_STENCIL_FACE_BACK_BIT) |
| state->dynamic.common.compare_mask.back = compareMask; |
| |
| state->dirty.compare_mask = true; |
| } |
| |
| void pvr_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, |
| VkStencilFaceFlags faceMask, |
| uint32_t writeMask) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| if (faceMask & VK_STENCIL_FACE_FRONT_BIT) |
| state->dynamic.common.write_mask.front = writeMask; |
| |
| if (faceMask & VK_STENCIL_FACE_BACK_BIT) |
| state->dynamic.common.write_mask.back = writeMask; |
| |
| state->dirty.write_mask = true; |
| } |
| |
| void pvr_CmdSetStencilReference(VkCommandBuffer commandBuffer, |
| VkStencilFaceFlags faceMask, |
| uint32_t reference) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| if (faceMask & VK_STENCIL_FACE_FRONT_BIT) |
| state->dynamic.common.reference.front = reference; |
| |
| if (faceMask & VK_STENCIL_FACE_BACK_BIT) |
| state->dynamic.common.reference.back = reference; |
| |
| state->dirty.reference = true; |
| } |
| |
| void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, |
| VkPipelineBindPoint pipelineBindPoint, |
| VkPipelineLayout _layout, |
| uint32_t firstSet, |
| uint32_t descriptorSetCount, |
| const VkDescriptorSet *pDescriptorSets, |
| uint32_t dynamicOffsetCount, |
| const uint32_t *pDynamicOffsets) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_descriptor_state *descriptor_state; |
| |
| assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| switch (pipelineBindPoint) { |
| case VK_PIPELINE_BIND_POINT_GRAPHICS: |
| case VK_PIPELINE_BIND_POINT_COMPUTE: |
| break; |
| |
| default: |
| unreachable("Unsupported bind point."); |
| break; |
| } |
| |
| if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) { |
| descriptor_state = &cmd_buffer->state.gfx_desc_state; |
| cmd_buffer->state.dirty.gfx_desc_dirty = true; |
| } else { |
| descriptor_state = &cmd_buffer->state.compute_desc_state; |
| cmd_buffer->state.dirty.compute_desc_dirty = true; |
| } |
| |
| for (uint32_t i = 0; i < descriptorSetCount; i++) { |
| PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]); |
| uint32_t index = firstSet + i; |
| |
| if (descriptor_state->descriptor_sets[index] != set) { |
| descriptor_state->descriptor_sets[index] = set; |
| descriptor_state->valid_mask |= (1u << index); |
| } |
| } |
| } |
| |
| void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, |
| uint32_t firstBinding, |
| uint32_t bindingCount, |
| const VkBuffer *pBuffers, |
| const VkDeviceSize *pOffsets) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings; |
| |
| /* We have to defer setting up vertex buffer since we need the buffer |
| * stride from the pipeline. |
| */ |
| |
| assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS && |
| bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| for (uint32_t i = 0; i < bindingCount; i++) { |
| vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]); |
| vb[firstBinding + i].offset = pOffsets[i]; |
| } |
| |
| cmd_buffer->state.dirty.vertex_bindings = true; |
| } |
| |
| void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, |
| VkBuffer buffer, |
| VkDeviceSize offset, |
| VkIndexType indexType) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| assert(offset < index_buffer->vk.size); |
| assert(indexType == VK_INDEX_TYPE_UINT32 || |
| indexType == VK_INDEX_TYPE_UINT16); |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| state->index_buffer_binding.buffer = index_buffer; |
| state->index_buffer_binding.offset = offset; |
| state->index_buffer_binding.type = indexType; |
| state->dirty.index_buffer_binding = true; |
| } |
| |
| void pvr_CmdPushConstants(VkCommandBuffer commandBuffer, |
| VkPipelineLayout layout, |
| VkShaderStageFlags stageFlags, |
| uint32_t offset, |
| uint32_t size, |
| const void *pValues) |
| { |
| #if defined(DEBUG) |
| const uint64_t ending = (uint64_t)offset + (uint64_t)size; |
| #endif |
| |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE); |
| |
| memcpy(&state->push_constants.data[offset], pValues, size); |
| |
| state->push_constants.dirty_stages |= stageFlags; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_render_pass *pass, |
| const struct pvr_framebuffer *framebuffer) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_render_pass_info *info = &state->render_pass_info; |
| |
| assert(pass->attachment_count == framebuffer->attachment_count); |
| |
| /* Free any previously allocated attachments. */ |
| vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments); |
| |
| if (pass->attachment_count == 0) { |
| info->attachments = NULL; |
| return VK_SUCCESS; |
| } |
| |
| info->attachments = |
| vk_zalloc(&cmd_buffer->vk.pool->alloc, |
| pass->attachment_count * sizeof(*info->attachments), |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!info->attachments) { |
| /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */ |
| state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); |
| return state->status; |
| } |
| |
| for (uint32_t i = 0; i < pass->attachment_count; i++) |
| info->attachments[i] = framebuffer->attachments[i]; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult pvr_init_render_targets(struct pvr_device *device, |
| struct pvr_render_pass *pass, |
| struct pvr_framebuffer *framebuffer) |
| { |
| for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) { |
| struct pvr_render_target *render_target = |
| pvr_get_render_target(pass, framebuffer, i); |
| |
| pthread_mutex_lock(&render_target->mutex); |
| |
| if (!render_target->valid) { |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &pass->hw_setup->renders[i]; |
| VkResult result; |
| |
| result = pvr_render_target_dataset_create(device, |
| framebuffer->width, |
| framebuffer->height, |
| hw_render->sample_count, |
| framebuffer->layers, |
| &render_target->rt_dataset); |
| if (result != VK_SUCCESS) { |
| pthread_mutex_unlock(&render_target->mutex); |
| return result; |
| } |
| |
| render_target->valid = true; |
| } |
| |
| pthread_mutex_unlock(&render_target->mutex); |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static const struct pvr_renderpass_hwsetup_subpass * |
| pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass) |
| { |
| const struct pvr_renderpass_hw_map *map = |
| &pass->hw_setup->subpass_map[subpass]; |
| |
| return &pass->hw_setup->renders[map->render].subpasses[map->subpass]; |
| } |
| |
| static void pvr_perform_start_of_render_attachment_clear( |
| struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_framebuffer *framebuffer, |
| uint32_t index, |
| bool is_depth_stencil, |
| uint32_t *index_list_clear_mask) |
| { |
| struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info; |
| const struct pvr_render_pass *pass = info->pass; |
| const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render]; |
| struct pvr_image_view *iview; |
| uint32_t view_idx; |
| uint32_t height; |
| uint32_t width; |
| |
| if (is_depth_stencil) { |
| bool stencil_clear; |
| bool depth_clear; |
| bool is_stencil; |
| bool is_depth; |
| |
| assert(hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED); |
| assert(index == 0); |
| |
| view_idx = hw_render->ds_attach_idx; |
| |
| is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format); |
| is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format); |
| depth_clear = hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR; |
| stencil_clear = hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR; |
| |
| /* Attempt to clear the ds attachment. Do not erroneously discard an |
| * attachment that has no depth clear but has a stencil attachment. |
| */ |
| /* if not (a ∧ c) ∨ (b ∧ d) */ |
| if (!((is_depth && depth_clear) || (is_stencil && stencil_clear))) |
| return; |
| } else if (hw_render->color_init[index].op != VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| return; |
| } else { |
| view_idx = hw_render->color_init[index].index; |
| } |
| |
| iview = info->attachments[view_idx]; |
| width = iview->vk.extent.width; |
| height = iview->vk.extent.height; |
| |
| /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init() |
| * were doing the same check (even if it's just an assert) to determine if a |
| * clear is needed. |
| */ |
| /* If this is single-layer fullscreen, we already do the clears in |
| * pvr_sub_cmd_gfx_job_init(). |
| */ |
| if (info->render_area.offset.x == 0 && info->render_area.offset.y == 0 && |
| info->render_area.extent.width == width && |
| info->render_area.extent.height == height && framebuffer->layers == 1) { |
| return; |
| } |
| |
| pvr_finishme("Unimplemented path!"); |
| } |
| |
| static void |
| pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info; |
| const struct pvr_framebuffer *framebuffer = info->framebuffer; |
| const struct pvr_render_pass *pass = info->pass; |
| const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render]; |
| |
| /* Mask of attachment clears using index lists instead of background object |
| * to clear. |
| */ |
| uint32_t index_list_clear_mask = 0; |
| |
| for (uint32_t i = 0; i < hw_render->color_init_count; i++) { |
| pvr_perform_start_of_render_attachment_clear(cmd_buffer, |
| framebuffer, |
| i, |
| false, |
| &index_list_clear_mask); |
| } |
| |
| info->enable_bg_tag = !!hw_render->color_init_count; |
| |
| /* If we're not using index list for all clears/loads then we need to run |
| * the background object on empty tiles. |
| */ |
| if (hw_render->color_init_count && |
| index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) { |
| info->process_empty_tiles = true; |
| } else { |
| info->process_empty_tiles = false; |
| } |
| |
| if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) { |
| uint32_t ds_index_list = 0; |
| |
| pvr_perform_start_of_render_attachment_clear(cmd_buffer, |
| framebuffer, |
| 0, |
| true, |
| &ds_index_list); |
| } |
| |
| if (index_list_clear_mask) |
| pvr_finishme("Add support for generating loadops shaders!"); |
| } |
| |
| static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| const struct pvr_render_pass *pass = state->render_pass_info.pass; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &pass->hw_setup->renders[sub_cmd->hw_render_idx]; |
| |
| if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) { |
| struct pvr_image_view **iviews = state->render_pass_info.attachments; |
| |
| state->depth_format = iviews[hw_render->ds_attach_idx]->vk.format; |
| } |
| } |
| |
| static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup) |
| { |
| for (uint32_t i = 0; i < hw_setup->render_count; i++) { |
| struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i]; |
| uint32_t render_targets_count = hw_render->init_setup.num_render_targets; |
| |
| for (uint32_t j = 0; |
| j < (hw_render->color_init_count * render_targets_count); |
| j += render_targets_count) { |
| for (uint32_t k = 0; k < hw_render->init_setup.num_render_targets; |
| k++) { |
| if (hw_render->color_init[j + k].op == |
| VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| return true; |
| } |
| } |
| } |
| if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR || |
| hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer, |
| const VkRenderPassBeginInfo *pRenderPassBegin) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| |
| /* Free any previously allocated clear values. */ |
| vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values); |
| |
| if (pRenderPassBegin->clearValueCount) { |
| const size_t size = pRenderPassBegin->clearValueCount * |
| sizeof(*state->render_pass_info.clear_values); |
| |
| state->render_pass_info.clear_values = |
| vk_zalloc(&cmd_buffer->vk.pool->alloc, |
| size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!state->render_pass_info.clear_values) { |
| state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); |
| return state->status; |
| } |
| |
| memcpy(state->render_pass_info.clear_values, |
| pRenderPassBegin->pClearValues, |
| size); |
| } else { |
| state->render_pass_info.clear_values = NULL; |
| } |
| |
| state->render_pass_info.clear_value_count = |
| pRenderPassBegin->clearValueCount; |
| |
| return VK_SUCCESS; |
| } |
| |
| static bool |
| pvr_is_large_clear_required(const struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| const struct pvr_device_info *const dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| const VkRect2D render_area = cmd_buffer->state.render_pass_info.render_area; |
| const uint32_t vf_max_x = rogue_get_param_vf_max_x(dev_info); |
| const uint32_t vf_max_y = rogue_get_param_vf_max_x(dev_info); |
| |
| return (render_area.extent.width > (vf_max_x / 2) - 1) || |
| (render_area.extent.height > (vf_max_y / 2) - 1); |
| } |
| |
| static void pvr_emit_clear_words(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| struct pvr_device *device = cmd_buffer->device; |
| uint32_t *stream; |
| |
| stream = pvr_csb_alloc_dwords(csb, PVR_CLEAR_VDM_STATE_DWORD_COUNT); |
| if (!stream) { |
| cmd_buffer->state.status = VK_ERROR_OUT_OF_HOST_MEMORY; |
| return; |
| } |
| |
| if (pvr_is_large_clear_required(cmd_buffer)) { |
| memcpy(stream, |
| device->static_clear_state.large_clear_vdm_words, |
| sizeof(device->static_clear_state.large_clear_vdm_words)); |
| } else { |
| memcpy(stream, |
| device->static_clear_state.vdm_words, |
| sizeof(device->static_clear_state.vdm_words)); |
| } |
| } |
| |
| static VkResult pvr_cs_write_load_op(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_gfx *sub_cmd, |
| struct pvr_load_op *load_op, |
| uint32_t isp_userpass) |
| { |
| const struct pvr_device *device = cmd_buffer->device; |
| struct pvr_static_clear_ppp_template template = |
| device->static_clear_state.ppp_templates[PVR_STATIC_CLEAR_COLOR_BIT]; |
| uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT]; |
| struct pvr_pds_upload shareds_update_program; |
| struct pvr_bo *pvr_bo; |
| VkResult result; |
| |
| result = pvr_load_op_data_create_and_upload(cmd_buffer, |
| load_op, |
| &shareds_update_program); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| template.config.ispctl.upass = isp_userpass; |
| |
| /* It might look odd that we aren't specifying the code segment's |
| * address anywhere. This is because the hardware always assumes that the |
| * data size is 2 128bit words and the code segments starts after that. |
| */ |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE], |
| TA_STATE_PDS_SHADERBASE, |
| shaderbase) { |
| shaderbase.addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset); |
| } |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXUNICODEBASE], |
| TA_STATE_PDS_TEXUNICODEBASE, |
| texunicodebase) { |
| texunicodebase.addr = |
| PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset); |
| } |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO1], |
| TA_STATE_PDS_SIZEINFO1, |
| sizeinfo1) { |
| /* Dummy coefficient loading program. */ |
| sizeinfo1.pds_varyingsize = 0; |
| |
| sizeinfo1.pds_texturestatesize = DIV_ROUND_UP( |
| shareds_update_program.data_size, |
| PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE)); |
| |
| sizeinfo1.pds_tempsize = |
| DIV_ROUND_UP(load_op->temps_count, |
| PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE)); |
| } |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO2], |
| TA_STATE_PDS_SIZEINFO2, |
| sizeinfo2) { |
| sizeinfo2.usc_sharedsize = |
| DIV_ROUND_UP(load_op->const_shareds_count, |
| PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE)); |
| } |
| |
| /* Dummy coefficient loading program. */ |
| pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_VARYINGBASE] = 0; |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXTUREDATABASE], |
| TA_STATE_PDS_TEXTUREDATABASE, |
| texturedatabase) { |
| texturedatabase.addr = PVR_DEV_ADDR(shareds_update_program.data_offset); |
| } |
| |
| template.config.pds_state = &pds_state; |
| |
| pvr_emit_ppp_from_template(&sub_cmd->control_stream, &template, &pvr_bo); |
| list_add(&pvr_bo->link, &cmd_buffer->bo_list); |
| |
| pvr_emit_clear_words(cmd_buffer, sub_cmd); |
| |
| pvr_reset_graphics_dirty_state(&cmd_buffer->state, false); |
| |
| return VK_SUCCESS; |
| } |
| |
| void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, |
| const VkRenderPassBeginInfo *pRenderPassBeginInfo, |
| const VkSubpassBeginInfo *pSubpassBeginInfo) |
| { |
| PVR_FROM_HANDLE(pvr_framebuffer, |
| framebuffer, |
| pRenderPassBeginInfo->framebuffer); |
| PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass); |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| assert(!state->render_pass_info.pass); |
| assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); |
| |
| /* FIXME: Create a separate function for everything using pass->subpasses, |
| * look at cmd_buffer_begin_subpass() for example. */ |
| state->render_pass_info.pass = pass; |
| state->render_pass_info.framebuffer = framebuffer; |
| state->render_pass_info.subpass_idx = 0; |
| state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea; |
| state->render_pass_info.current_hw_subpass = 0; |
| state->render_pass_info.pipeline_bind_point = |
| pass->subpasses[0].pipeline_bind_point; |
| state->render_pass_info.isp_userpass = pass->subpasses[0].isp_userpass; |
| state->dirty.isp_userpass = true; |
| |
| result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| state->status = |
| pvr_init_render_targets(cmd_buffer->device, pass, framebuffer); |
| if (state->status != VK_SUCCESS) |
| return; |
| |
| result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo); |
| if (result != VK_SUCCESS) |
| return; |
| |
| assert(pass->subpasses[0].pipeline_bind_point == |
| VK_PIPELINE_BIND_POINT_GRAPHICS); |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Run subpass 0 "soft" background object after the actual background |
| * object. |
| */ |
| hw_subpass = pvr_get_hw_subpass(pass, 0); |
| if (hw_subpass->load_op) { |
| result = pvr_cs_write_load_op(cmd_buffer, |
| &cmd_buffer->state.current_sub_cmd->gfx, |
| hw_subpass->load_op, |
| 0); |
| if (result != VK_SUCCESS) |
| return; |
| } |
| |
| pvr_perform_start_of_render_clears(cmd_buffer); |
| pvr_stash_depth_format(&cmd_buffer->state, |
| &cmd_buffer->state.current_sub_cmd->gfx); |
| |
| if (!pvr_loadops_contain_clear(pass->hw_setup)) { |
| state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_CHECK_FOR_CLEAR; |
| state->dynamic.scissor_accum_bounds.offset.x = 0; |
| state->dynamic.scissor_accum_bounds.offset.y = 0; |
| state->dynamic.scissor_accum_bounds.extent.width = 0; |
| state->dynamic.scissor_accum_bounds.extent.height = 0; |
| } else { |
| state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_DISABLED; |
| } |
| } |
| |
| VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer, |
| const VkCommandBufferBeginInfo *pBeginInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state; |
| VkResult result; |
| |
| pvr_cmd_buffer_reset(cmd_buffer); |
| |
| cmd_buffer->usage_flags = pBeginInfo->flags; |
| state = &cmd_buffer->state; |
| |
| /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for |
| * primary level command buffers. |
| * |
| * From the Vulkan 1.0 spec: |
| * |
| * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a |
| * secondary command buffer is considered to be entirely inside a render |
| * pass. If this is a primary command buffer, then this bit is ignored. |
| */ |
| if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { |
| cmd_buffer->usage_flags &= |
| ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT; |
| } |
| |
| if (cmd_buffer->usage_flags & |
| VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { |
| const VkCommandBufferInheritanceInfo *inheritance_info = |
| pBeginInfo->pInheritanceInfo; |
| struct pvr_render_pass *pass; |
| |
| pass = pvr_render_pass_from_handle(inheritance_info->renderPass); |
| state->render_pass_info.pass = pass; |
| state->render_pass_info.framebuffer = |
| pvr_framebuffer_from_handle(inheritance_info->framebuffer); |
| state->render_pass_info.subpass_idx = inheritance_info->subpass; |
| state->render_pass_info.isp_userpass = |
| pass->subpasses[inheritance_info->subpass].isp_userpass; |
| |
| result = |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| |
| memset(state->barriers_needed, |
| 0xFF, |
| sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed)); |
| |
| cmd_buffer->status = PVR_CMD_BUFFER_STATUS_RECORDING; |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult pvr_ResetCommandBuffer(VkCommandBuffer commandBuffer, |
| VkCommandBufferResetFlags flags) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| |
| pvr_cmd_buffer_reset(cmd_buffer); |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_transfer_cmd *transfer_cmd) |
| { |
| struct pvr_sub_cmd_transfer *sub_cmd; |
| VkResult result; |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer; |
| |
| list_addtail(&transfer_cmd->link, &sub_cmd->transfer_cmds); |
| |
| return VK_SUCCESS; |
| } |
| |
| #define PVR_WRITE(_buffer, _value, _offset, _max) \ |
| do { \ |
| __typeof__(_value) __value = _value; \ |
| uint64_t __offset = _offset; \ |
| uint32_t __nr_dwords = sizeof(__value) / sizeof(uint32_t); \ |
| static_assert(__same_type(*_buffer, __value), \ |
| "Buffer and value type mismatch"); \ |
| assert((__offset + __nr_dwords) <= (_max)); \ |
| assert((__offset % __nr_dwords) == 0U); \ |
| _buffer[__offset / __nr_dwords] = __value; \ |
| } while (0) |
| |
| static VkResult |
| pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_graphics_pipeline *const gfx_pipeline) |
| { |
| const struct pvr_vertex_shader_state *const vertex_state = |
| &gfx_pipeline->vertex_shader_state; |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_pds_info *const pds_info = state->pds_shader.info; |
| const uint8_t *entries; |
| uint32_t *dword_buffer; |
| uint64_t *qword_buffer; |
| struct pvr_bo *pvr_bo; |
| VkResult result; |
| |
| result = pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.pds_heap, |
| pds_info->data_size_in_dwords << 2, |
| PVR_BO_ALLOC_FLAG_CPU_MAPPED, |
| &pvr_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| dword_buffer = (uint32_t *)pvr_bo->bo->map; |
| qword_buffer = (uint64_t *)pvr_bo->bo->map; |
| |
| entries = (uint8_t *)pds_info->entries; |
| |
| for (uint32_t i = 0; i < pds_info->entry_count; i++) { |
| const struct pvr_const_map_entry *const entry_header = |
| (struct pvr_const_map_entry *)entries; |
| |
| switch (entry_header->type) { |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: { |
| const struct pvr_const_map_entry_literal32 *const literal = |
| (struct pvr_const_map_entry_literal32 *)entries; |
| |
| PVR_WRITE(dword_buffer, |
| literal->literal_value, |
| literal->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*literal); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: { |
| const struct pvr_const_map_entry_doutu_address *const doutu_addr = |
| (struct pvr_const_map_entry_doutu_address *)entries; |
| const pvr_dev_addr_t exec_addr = |
| PVR_DEV_ADDR_OFFSET(vertex_state->bo->vma->dev_addr, |
| vertex_state->entry_offset); |
| uint64_t addr = 0ULL; |
| |
| pvr_set_usc_execution_address64(&addr, exec_addr.addr); |
| |
| PVR_WRITE(qword_buffer, |
| addr | doutu_addr->doutu_control, |
| doutu_addr->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*doutu_addr); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: { |
| const struct pvr_const_map_entry_base_instance *const base_instance = |
| (struct pvr_const_map_entry_base_instance *)entries; |
| |
| PVR_WRITE(dword_buffer, |
| state->draw_state.base_instance, |
| base_instance->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*base_instance); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: { |
| const struct pvr_const_map_entry_vertex_attribute_address |
| *const attribute = |
| (struct pvr_const_map_entry_vertex_attribute_address *)entries; |
| const struct pvr_vertex_binding *const binding = |
| &state->vertex_bindings[attribute->binding_index]; |
| const pvr_dev_addr_t addr = |
| PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr, |
| binding->offset + attribute->offset); |
| |
| PVR_WRITE(qword_buffer, |
| addr.addr, |
| attribute->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*attribute); |
| break; |
| } |
| |
| default: |
| unreachable("Unsupported data section map"); |
| break; |
| } |
| } |
| |
| state->pds_vertex_attrib_offset = |
| pvr_bo->vma->dev_addr.addr - |
| cmd_buffer->device->heaps.pds_heap->base_addr.addr; |
| |
| pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo); |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult pvr_setup_descriptor_mappings( |
| struct pvr_cmd_buffer *const cmd_buffer, |
| enum pvr_stage_allocation stage, |
| const struct pvr_stage_allocation_descriptor_state *descriptor_state, |
| const pvr_dev_addr_t *const num_worgroups_buff_addr, |
| uint32_t *const descriptor_data_offset_out) |
| { |
| const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info; |
| const struct pvr_descriptor_state *desc_state; |
| const uint8_t *entries; |
| uint32_t *dword_buffer; |
| uint64_t *qword_buffer; |
| struct pvr_bo *pvr_bo; |
| VkResult result; |
| |
| if (!pds_info->data_size_in_dwords) |
| return VK_SUCCESS; |
| |
| result = pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.pds_heap, |
| pds_info->data_size_in_dwords << 2, |
| PVR_BO_ALLOC_FLAG_CPU_MAPPED, |
| &pvr_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| dword_buffer = (uint32_t *)pvr_bo->bo->map; |
| qword_buffer = (uint64_t *)pvr_bo->bo->map; |
| |
| entries = (uint8_t *)pds_info->entries; |
| |
| switch (stage) { |
| case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY: |
| case PVR_STAGE_ALLOCATION_FRAGMENT: |
| desc_state = &cmd_buffer->state.gfx_desc_state; |
| break; |
| |
| case PVR_STAGE_ALLOCATION_COMPUTE: |
| desc_state = &cmd_buffer->state.compute_desc_state; |
| break; |
| |
| default: |
| unreachable("Unsupported stage."); |
| break; |
| } |
| |
| for (uint32_t i = 0; i < pds_info->entry_count; i++) { |
| const struct pvr_const_map_entry *const entry_header = |
| (struct pvr_const_map_entry *)entries; |
| |
| /* TODO: See if instead of reusing the blend constant buffer type entry, |
| * we can setup a new buffer type specifically for num_workgroups or other |
| * built-in variables. The mappings are setup at pipeline creation when |
| * creating the descriptor program. |
| */ |
| pvr_finishme("Handle blend constant reuse for compute."); |
| |
| switch (entry_header->type) { |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: { |
| const struct pvr_const_map_entry_literal32 *const literal = |
| (struct pvr_const_map_entry_literal32 *)entries; |
| |
| PVR_WRITE(dword_buffer, |
| literal->literal_value, |
| literal->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*literal); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: { |
| const struct pvr_const_map_entry_constant_buffer *const_buffer_entry = |
| (struct pvr_const_map_entry_constant_buffer *)entries; |
| const uint32_t desc_set = const_buffer_entry->desc_set; |
| const uint32_t binding = const_buffer_entry->binding; |
| const struct pvr_descriptor_set *descriptor_set; |
| const struct pvr_descriptor *descriptor; |
| pvr_dev_addr_t buffer_addr; |
| |
| assert(desc_set < PVR_MAX_DESCRIPTOR_SETS); |
| descriptor_set = desc_state->descriptor_sets[desc_set]; |
| |
| /* TODO: Handle dynamic buffers. */ |
| descriptor = &descriptor_set->descriptors[binding]; |
| assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); |
| |
| assert(descriptor->buffer_desc_range == |
| const_buffer_entry->size_in_dwords * sizeof(uint32_t)); |
| assert(descriptor->buffer_create_info_size == |
| const_buffer_entry->size_in_dwords * sizeof(uint32_t)); |
| |
| buffer_addr = |
| PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr, |
| const_buffer_entry->offset * sizeof(uint32_t)); |
| |
| PVR_WRITE(qword_buffer, |
| buffer_addr.addr, |
| const_buffer_entry->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*const_buffer_entry); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: { |
| const struct pvr_const_map_entry_descriptor_set *desc_set_entry = |
| (struct pvr_const_map_entry_descriptor_set *)entries; |
| const uint32_t desc_set_num = desc_set_entry->descriptor_set; |
| const struct pvr_descriptor_set *descriptor_set; |
| pvr_dev_addr_t desc_set_addr; |
| |
| assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS); |
| |
| /* TODO: Remove this when the compiler provides us with usage info? |
| */ |
| /* We skip DMAing unbound descriptor sets. */ |
| if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) { |
| const struct pvr_const_map_entry_literal32 *literal; |
| uint32_t zero_literal_value; |
| |
| entries += sizeof(*desc_set_entry); |
| literal = (struct pvr_const_map_entry_literal32 *)entries; |
| |
| /* TODO: Is there any guarantee that a literal will follow the |
| * descriptor set entry? |
| */ |
| assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32); |
| |
| /* We zero out the DMA size so the DMA isn't performed. */ |
| zero_literal_value = |
| literal->literal_value & |
| PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK; |
| |
| PVR_WRITE(qword_buffer, |
| UINT64_C(0), |
| desc_set_entry->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| PVR_WRITE(dword_buffer, |
| zero_literal_value, |
| desc_set_entry->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*literal); |
| i++; |
| continue; |
| } |
| |
| descriptor_set = desc_state->descriptor_sets[desc_set_num]; |
| |
| desc_set_addr = descriptor_set->pvr_bo->vma->dev_addr; |
| |
| if (desc_set_entry->primary) { |
| desc_set_addr = PVR_DEV_ADDR_OFFSET( |
| desc_set_addr, |
| descriptor_set->layout->memory_layout_in_dwords_per_stage[stage] |
| .primary_offset |
| << 2U); |
| } else { |
| desc_set_addr = PVR_DEV_ADDR_OFFSET( |
| desc_set_addr, |
| descriptor_set->layout->memory_layout_in_dwords_per_stage[stage] |
| .secondary_offset |
| << 2U); |
| } |
| |
| desc_set_addr = PVR_DEV_ADDR_OFFSET( |
| desc_set_addr, |
| (uint64_t)desc_set_entry->offset_in_dwords << 2U); |
| |
| PVR_WRITE(qword_buffer, |
| desc_set_addr.addr, |
| desc_set_entry->const_offset, |
| pds_info->data_size_in_dwords); |
| |
| entries += sizeof(*desc_set_entry); |
| break; |
| } |
| |
| case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: { |
| const struct pvr_const_map_entry_special_buffer *special_buff_entry = |
| (struct pvr_const_map_entry_special_buffer *)entries; |
| |
| switch (special_buff_entry->buffer_type) { |
| case PVR_BUFFER_TYPE_COMPILE_TIME: { |
| uint64_t addr = descriptor_state->static_consts->vma->dev_addr.addr; |
| |
| PVR_WRITE(qword_buffer, |
| addr, |
| special_buff_entry->const_offset, |
| pds_info->data_size_in_dwords); |
| break; |
| } |
| |
| case PVR_BUFFER_TYPE_BLEND_CONSTS: |
| if (stage == PVR_STAGE_ALLOCATION_COMPUTE) { |
| assert(num_worgroups_buff_addr->addr); |
| |
| /* TODO: Check if we need to offset this (e.g. for just y and z), |
| * or cope with any reordering? |
| */ |
| PVR_WRITE(qword_buffer, |
| num_worgroups_buff_addr->addr, |
| special_buff_entry->const_offset, |
| pds_info->data_size_in_dwords); |
| } else { |
| pvr_finishme("Add blend constants support."); |
| } |
| break; |
| |
| default: |
| unreachable("Unsupported special buffer type."); |
| } |
| |
| entries += sizeof(*special_buff_entry); |
| break; |
| } |
| |
| default: |
| unreachable("Unsupported map entry type."); |
| } |
| } |
| |
| pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo); |
| |
| *descriptor_data_offset_out = |
| pvr_bo->vma->dev_addr.addr - |
| cmd_buffer->device->heaps.pds_heap->base_addr.addr; |
| |
| return VK_SUCCESS; |
| } |
| |
| #undef PVR_WRITE |
| |
| static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd) |
| { |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| const struct pvr_compute_pipeline *pipeline = state->compute_pipeline; |
| const uint32_t const_shared_reg_count = |
| pipeline->state.shader.const_shared_reg_count; |
| struct pvr_compute_kernel_info info; |
| |
| /* No shared regs, no need to use an allocation kernel. */ |
| if (!const_shared_reg_count) |
| return; |
| |
| info = (struct pvr_compute_kernel_info){ |
| .indirect_buffer_addr = PVR_DEV_ADDR_INVALID, |
| .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE), |
| |
| .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL), |
| .usc_common_shared = true, |
| .usc_common_size = |
| DIV_ROUND_UP(const_shared_reg_count, |
| PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)), |
| |
| .local_size = { 1, 1, 1 }, |
| .global_size = { 1, 1, 1 }, |
| }; |
| |
| /* Sometimes we don't have a secondary program if there were no constants to |
| * write, but we still need to run a PDS program to accomplish the |
| * allocation of the local/common store shared registers so we repurpose the |
| * deallocation PDS program. |
| */ |
| if (pipeline->state.descriptor.pds_info.code_size_in_dwords) { |
| uint32_t pds_data_size_in_dwords = |
| pipeline->state.descriptor.pds_info.data_size_in_dwords; |
| |
| info.pds_data_offset = state->pds_compute_descriptor_data_offset; |
| info.pds_data_size = |
| DIV_ROUND_UP(pds_data_size_in_dwords << 2U, |
| PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)); |
| |
| /* Check that we have upload the code section. */ |
| assert(pipeline->state.descriptor.pds_code.code_size); |
| info.pds_code_offset = pipeline->state.descriptor.pds_code.code_offset; |
| } else { |
| /* FIXME: There should be a deallocation pds program already uploaded |
| * that we use at this point. |
| */ |
| assert(!"Unimplemented"); |
| } |
| |
| /* We don't need to pad the workgroup size. */ |
| |
| info.max_instances = |
| pvr_compute_flat_slot_size(pdevice, const_shared_reg_count, false, 1U); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
| |
| static uint32_t |
| pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice, |
| uint32_t workgroup_size, |
| uint32_t coeff_regs_count) |
| { |
| const struct pvr_device_runtime_info *dev_runtime_info = |
| &pdevice->dev_runtime_info; |
| const struct pvr_device_info *dev_info = &pdevice->dev_info; |
| uint32_t max_avail_coeff_regs = |
| dev_runtime_info->cdm_max_local_mem_size_regs; |
| uint32_t coeff_regs_count_aligned = |
| ALIGN_POT(coeff_regs_count, |
| PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U); |
| |
| /* If the work group size is > ROGUE_MAX_INSTANCES_PER_TASK. We now *always* |
| * pad the work group size to the next multiple of |
| * ROGUE_MAX_INSTANCES_PER_TASK. |
| * |
| * If we use more than 1/8th of the max coefficient registers then we round |
| * work group size up to the next multiple of ROGUE_MAX_INSTANCES_PER_TASK |
| */ |
| /* TODO: See if this can be optimized. */ |
| if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK || |
| coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) { |
| assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info)); |
| |
| return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK); |
| } |
| |
| return workgroup_size; |
| } |
| |
| /* TODO: Wire up the base_workgroup variant program when implementing |
| * VK_KHR_device_group. The values will also need patching into the program. |
| */ |
| static void pvr_compute_update_kernel( |
| struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_compute *const sub_cmd, |
| pvr_dev_addr_t indirect_addr, |
| const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS]) |
| { |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| const struct pvr_device_runtime_info *dev_runtime_info = |
| &pdevice->dev_runtime_info; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| const struct pvr_compute_pipeline *pipeline = state->compute_pipeline; |
| const struct pvr_pds_info *program_info = |
| &pipeline->state.primary_program_info; |
| |
| struct pvr_compute_kernel_info info = { |
| .indirect_buffer_addr = indirect_addr, |
| .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY), |
| .pds_temp_size = |
| DIV_ROUND_UP(program_info->temps_required << 2U, |
| PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)), |
| |
| .pds_data_size = |
| DIV_ROUND_UP(program_info->data_size_in_dwords << 2U, |
| PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)), |
| .pds_data_offset = pipeline->state.primary_program.data_offset, |
| .pds_code_offset = pipeline->state.primary_program.code_offset, |
| |
| .sd_type = PVRX(CDMCTRL_SD_TYPE_USC), |
| |
| .usc_unified_size = |
| DIV_ROUND_UP(pipeline->state.shader.input_register_count << 2U, |
| PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)), |
| |
| /* clang-format off */ |
| .global_size = { |
| global_workgroup_size[0], |
| global_workgroup_size[1], |
| global_workgroup_size[2] |
| }, |
| /* clang-format on */ |
| }; |
| |
| uint32_t work_size = pipeline->state.shader.work_size; |
| uint32_t coeff_regs; |
| |
| if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) { |
| /* Enforce a single workgroup per cluster through allocation starvation. |
| */ |
| coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs; |
| } else { |
| coeff_regs = pipeline->state.shader.coefficient_register_count; |
| } |
| |
| info.usc_common_size = |
| DIV_ROUND_UP(coeff_regs << 2U, |
| PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)); |
| |
| /* Use a whole slot per workgroup. */ |
| work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK); |
| |
| coeff_regs += pipeline->state.shader.const_shared_reg_count; |
| |
| work_size = |
| pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs); |
| |
| info.local_size[0] = work_size; |
| info.local_size[1] = 1U; |
| info.local_size[2] = 1U; |
| |
| info.max_instances = |
| pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size); |
| |
| pvr_compute_generate_control_stream(csb, sub_cmd, &info); |
| } |
| |
| void pvr_CmdDispatch(VkCommandBuffer commandBuffer, |
| uint32_t groupCountX, |
| uint32_t groupCountY, |
| uint32_t groupCountZ) |
| { |
| const uint32_t workgroup_size[] = { groupCountX, groupCountY, groupCountZ }; |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| const struct pvr_compute_pipeline *compute_pipeline = |
| state->compute_pipeline; |
| const VkShaderStageFlags push_consts_stage_mask = |
| compute_pipeline->base.layout->push_constants_shader_stages; |
| struct pvr_sub_cmd_compute *sub_cmd; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| if (!groupCountX || !groupCountY || !groupCountZ) |
| return; |
| |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE); |
| |
| sub_cmd = &state->current_sub_cmd->compute; |
| |
| sub_cmd->uses_atomic_ops |= compute_pipeline->state.shader.uses_atomic_ops; |
| sub_cmd->uses_barrier |= compute_pipeline->state.shader.uses_barrier; |
| |
| if (push_consts_stage_mask & VK_SHADER_STAGE_COMPUTE_BIT) { |
| /* TODO: Add a dirty push constants mask in the cmd_buffer state and |
| * check for dirty compute stage. |
| */ |
| pvr_finishme("Add support for push constants."); |
| } |
| |
| if (compute_pipeline->state.shader.uses_num_workgroups) { |
| struct pvr_bo *num_workgroups_bo; |
| |
| result = pvr_cmd_buffer_upload_general(cmd_buffer, |
| workgroup_size, |
| sizeof(workgroup_size), |
| &num_workgroups_bo); |
| if (result != VK_SUCCESS) |
| return; |
| |
| result = pvr_setup_descriptor_mappings( |
| cmd_buffer, |
| PVR_STAGE_ALLOCATION_COMPUTE, |
| &compute_pipeline->state.descriptor, |
| &num_workgroups_bo->vma->dev_addr, |
| &state->pds_compute_descriptor_data_offset); |
| if (result != VK_SUCCESS) |
| return; |
| } else if ((compute_pipeline->base.layout |
| ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] && |
| state->dirty.compute_desc_dirty) || |
| state->dirty.compute_pipeline_binding) { |
| result = pvr_setup_descriptor_mappings( |
| cmd_buffer, |
| PVR_STAGE_ALLOCATION_COMPUTE, |
| &compute_pipeline->state.descriptor, |
| NULL, |
| &state->pds_compute_descriptor_data_offset); |
| if (result != VK_SUCCESS) |
| return; |
| } |
| |
| pvr_compute_update_shared(cmd_buffer, sub_cmd); |
| |
| pvr_compute_update_kernel(cmd_buffer, |
| sub_cmd, |
| PVR_DEV_ADDR_INVALID, |
| workgroup_size); |
| } |
| |
| void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer, |
| VkBuffer _buffer, |
| VkDeviceSize offset) |
| { |
| const uint32_t workgroup_size[PVR_WORKGROUP_DIMENSIONS] = { 1, 1, 1 }; |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| const struct pvr_compute_pipeline *compute_pipeline = |
| state->compute_pipeline; |
| const VkShaderStageFlags push_consts_stage_mask = |
| compute_pipeline->base.layout->push_constants_shader_stages; |
| PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer); |
| struct pvr_sub_cmd_compute *sub_cmd; |
| pvr_dev_addr_t indirect_addr; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| indirect_addr = PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset); |
| |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE); |
| |
| sub_cmd = &state->current_sub_cmd->compute; |
| sub_cmd->uses_atomic_ops |= compute_pipeline->state.shader.uses_atomic_ops; |
| sub_cmd->uses_barrier |= compute_pipeline->state.shader.uses_barrier; |
| |
| if (push_consts_stage_mask & VK_SHADER_STAGE_COMPUTE_BIT) { |
| /* TODO: Add a dirty push constants mask in the cmd_buffer state and |
| * check for dirty compute stage. |
| */ |
| pvr_finishme("Add support for push constants."); |
| } |
| |
| if (compute_pipeline->state.shader.uses_num_workgroups) { |
| result = pvr_setup_descriptor_mappings( |
| cmd_buffer, |
| PVR_STAGE_ALLOCATION_COMPUTE, |
| &compute_pipeline->state.descriptor, |
| &indirect_addr, |
| &state->pds_compute_descriptor_data_offset); |
| if (result != VK_SUCCESS) |
| return; |
| } else if ((compute_pipeline->base.layout |
| ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] && |
| state->dirty.compute_desc_dirty) || |
| state->dirty.compute_pipeline_binding) { |
| result = pvr_setup_descriptor_mappings( |
| cmd_buffer, |
| PVR_STAGE_ALLOCATION_COMPUTE, |
| &compute_pipeline->state.descriptor, |
| NULL, |
| &state->pds_compute_descriptor_data_offset); |
| if (result != VK_SUCCESS) |
| return; |
| } |
| |
| pvr_compute_update_shared(cmd_buffer, sub_cmd); |
| |
| pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size); |
| } |
| |
| static void |
| pvr_update_draw_state(struct pvr_cmd_buffer_state *const state, |
| const struct pvr_cmd_buffer_draw_state *const draw_state) |
| { |
| /* We don't have a state to tell us that base_instance is being used so it |
| * gets used as a boolean - 0 means we'll use a pds program that skips the |
| * base instance addition. If the base_instance gets used (and the last |
| * draw's base_instance was 0) then we switch to the BASE_INSTANCE attrib |
| * program. |
| * |
| * If base_instance changes then we only need to update the data section. |
| * |
| * The only draw call state that doesn't really matter is the start vertex |
| * as that is handled properly in the VDM state in all cases. |
| */ |
| if ((state->draw_state.draw_indexed != draw_state->draw_indexed) || |
| (state->draw_state.draw_indirect != draw_state->draw_indirect) || |
| (state->draw_state.base_instance == 0 && |
| draw_state->base_instance != 0)) { |
| state->dirty.draw_variant = true; |
| } else if (state->draw_state.base_instance != draw_state->base_instance) { |
| state->dirty.draw_base_instance = true; |
| } |
| |
| state->draw_state = *draw_state; |
| } |
| |
| static uint32_t pvr_calc_shared_regs_count( |
| const struct pvr_graphics_pipeline *const gfx_pipeline) |
| { |
| const struct pvr_pipeline_stage_state *const vertex_state = |
| &gfx_pipeline->vertex_shader_state.stage_state; |
| uint32_t shared_regs = vertex_state->const_shared_reg_count + |
| vertex_state->const_shared_reg_offset; |
| |
| if (gfx_pipeline->fragment_shader_state.bo) { |
| const struct pvr_pipeline_stage_state *const fragment_state = |
| &gfx_pipeline->fragment_shader_state.stage_state; |
| uint32_t fragment_regs = fragment_state->const_shared_reg_count + |
| fragment_state->const_shared_reg_offset; |
| |
| shared_regs = MAX2(shared_regs, fragment_regs); |
| } |
| |
| return shared_regs; |
| } |
| |
| static void |
| pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd, |
| const uint32_t pds_vertex_descriptor_data_offset) |
| { |
| const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_stage_allocation_descriptor_state |
| *const vertex_descriptor_state = |
| &state->gfx_pipeline->vertex_shader_state.descriptor_state; |
| const struct pvr_pipeline_stage_state *const vertex_stage_state = |
| &state->gfx_pipeline->vertex_shader_state.stage_state; |
| struct pvr_csb *const csb = &sub_cmd->control_stream; |
| |
| if (!vertex_descriptor_state->pds_info.code_size_in_dwords) |
| return; |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) { |
| state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ALL); |
| |
| state0.usc_common_size = |
| DIV_ROUND_UP(vertex_stage_state->const_shared_reg_count << 2, |
| PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE)); |
| |
| state0.pds_data_size = DIV_ROUND_UP( |
| vertex_descriptor_state->pds_info.data_size_in_dwords << 2, |
| PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE)); |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) { |
| state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset); |
| state1.sd_type = PVRX(VDMCTRL_SD_TYPE_NONE); |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) { |
| state2.pds_code_addr = |
| PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset); |
| } |
| } |
| |
| static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header; |
| const struct pvr_graphics_pipeline *const gfx_pipeline = |
| cmd_buffer->state.gfx_pipeline; |
| struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; |
| const struct pvr_vertex_shader_state *const vertex_state = |
| &gfx_pipeline->vertex_shader_state; |
| uint32_t output_selects; |
| |
| /* TODO: Handle vertex and fragment shader state flags. */ |
| |
| pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) { |
| const VkPrimitiveTopology topology = |
| gfx_pipeline->input_asm_state.topology; |
| |
| state.rhw_pres = true; |
| state.vtxsize = DIV_ROUND_UP(vertex_state->vertex_output_size, 4U); |
| state.psprite_size_pres = (topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST); |
| } |
| |
| if (ppp_state->output_selects != output_selects) { |
| ppp_state->output_selects = output_selects; |
| header->pres_outselects = true; |
| } |
| |
| if (ppp_state->varying_word[0] != vertex_state->varying[0]) { |
| ppp_state->varying_word[0] = vertex_state->varying[0]; |
| header->pres_varying_word0 = true; |
| } |
| |
| if (ppp_state->varying_word[1] != vertex_state->varying[1]) { |
| ppp_state->varying_word[1] = vertex_state->varying[1]; |
| header->pres_varying_word1 = true; |
| } |
| } |
| |
| static void |
| pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer, |
| struct PVRX(TA_STATE_ISPA) *const ispa_out) |
| { |
| struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header; |
| const struct pvr_graphics_pipeline *const gfx_pipeline = |
| cmd_buffer->state.gfx_pipeline; |
| struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; |
| const struct pvr_dynamic_state *const dynamic_state = |
| &cmd_buffer->state.dynamic.common; |
| const struct pvr_render_pass_info *const pass_info = |
| &cmd_buffer->state.render_pass_info; |
| const uint32_t subpass_idx = pass_info->subpass_idx; |
| const uint32_t *depth_stencil_attachment_idx = |
| pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment; |
| const struct pvr_image_view *const attachment = |
| (!depth_stencil_attachment_idx) |
| ? NULL |
| : pass_info->attachments[*depth_stencil_attachment_idx]; |
| |
| const VkCullModeFlags cull_mode = gfx_pipeline->raster_state.cull_mode; |
| const bool raster_discard_enabled = |
| gfx_pipeline->raster_state.discard_enable; |
| const bool disable_all = raster_discard_enabled || !attachment; |
| |
| const VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology; |
| const enum PVRX(TA_OBJTYPE) obj_type = pvr_ta_objtype(topology); |
| |
| const bool disable_stencil_write = disable_all; |
| const bool disable_stencil_test = |
| disable_all || !vk_format_has_stencil(attachment->vk.format); |
| |
| const bool disable_depth_write = disable_all; |
| const bool disable_depth_test = disable_all || |
| !vk_format_has_depth(attachment->vk.format); |
| |
| uint32_t ispb_stencil_off; |
| bool is_two_sided = false; |
| uint32_t isp_control; |
| |
| uint32_t line_width; |
| uint32_t common_a; |
| uint32_t front_a; |
| uint32_t front_b; |
| uint32_t back_a; |
| uint32_t back_b; |
| |
| /* Convert to 4.4 fixed point format. */ |
| line_width = util_unsigned_fixed(dynamic_state->line_width, 4); |
| |
| /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16]. |
| * If 0 it stays at 0, otherwise we subtract 1. |
| */ |
| line_width = (!!line_width) * (line_width - 1); |
| |
| line_width = MIN2(line_width, PVRX(TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX)); |
| |
| /* TODO: Part of the logic in this function is duplicated in another part |
| * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier? |
| */ |
| |
| pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) { |
| ispa.pointlinewidth = line_width; |
| |
| if (disable_depth_test) |
| ispa.dcmpmode = PVRX(TA_CMPMODE_ALWAYS); |
| else |
| ispa.dcmpmode = pvr_ta_cmpmode(gfx_pipeline->depth_compare_op); |
| |
| /* FIXME: Can we just have this and remove the assignment above? |
| * The user provides a depthTestEnable at vkCreateGraphicsPipelines() |
| * should we be using that? |
| */ |
| ispa.dcmpmode |= gfx_pipeline->depth_compare_op; |
| |
| ispa.dwritedisable = disable_depth_test || disable_depth_write; |
| /* FIXME: Can we just have this and remove the assignment above? */ |
| ispa.dwritedisable = ispa.dwritedisable || |
| gfx_pipeline->depth_write_disable; |
| |
| ispa.passtype = gfx_pipeline->fragment_shader_state.pass_type; |
| |
| ispa.objtype = obj_type; |
| |
| /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and |
| * objtype are needed by pvr_setup_triangle_merging_flag. |
| */ |
| if (ispa_out) |
| *ispa_out = ispa; |
| } |
| |
| /* FIXME: This logic should be redone and improved. Can we also get rid of |
| * the front and back variants? |
| */ |
| |
| pvr_csb_pack (&front_a, TA_STATE_ISPA, ispa) { |
| ispa.sref = (!disable_stencil_test) * dynamic_state->reference.front; |
| } |
| front_a |= common_a; |
| |
| pvr_csb_pack (&back_a, TA_STATE_ISPA, ispa) { |
| ispa.sref = (!disable_stencil_test) * dynamic_state->reference.back; |
| } |
| back_a |= common_a; |
| |
| /* TODO: Does this actually represent the ispb control word on stencil off? |
| * If not, rename the variable. |
| */ |
| pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) { |
| ispb.sop3 = PVRX(TA_ISPB_STENCILOP_KEEP); |
| ispb.sop2 = PVRX(TA_ISPB_STENCILOP_KEEP); |
| ispb.sop1 = PVRX(TA_ISPB_STENCILOP_KEEP); |
| ispb.scmpmode = PVRX(TA_CMPMODE_ALWAYS); |
| } |
| |
| if (disable_stencil_test) { |
| back_b = front_b = ispb_stencil_off; |
| } else { |
| pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) { |
| ispb.swmask = |
| (!disable_stencil_write) * dynamic_state->write_mask.front; |
| ispb.scmpmask = dynamic_state->compare_mask.front; |
| |
| ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_front.pass_op); |
| ispb.sop2 = |
| pvr_ta_stencilop(gfx_pipeline->stencil_front.depth_fail_op); |
| ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_front.fail_op); |
| |
| ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_front.compare_op); |
| } |
| |
| pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) { |
| ispb.swmask = |
| (!disable_stencil_write) * dynamic_state->write_mask.back; |
| ispb.scmpmask = dynamic_state->compare_mask.back; |
| |
| ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_back.pass_op); |
| ispb.sop2 = pvr_ta_stencilop(gfx_pipeline->stencil_back.depth_fail_op); |
| ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_back.fail_op); |
| |
| ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_back.compare_op); |
| } |
| } |
| |
| if (front_a != back_a || front_b != back_b) { |
| if (cull_mode & VK_CULL_MODE_BACK_BIT) { |
| /* Single face, using front state. */ |
| } else if (cull_mode & VK_CULL_MODE_FRONT_BIT) { |
| /* Single face, using back state. */ |
| |
| front_a = back_a; |
| front_b = back_b; |
| } else { |
| /* Both faces. */ |
| |
| header->pres_ispctl_ba = is_two_sided = true; |
| |
| if (gfx_pipeline->raster_state.front_face == |
| VK_FRONT_FACE_COUNTER_CLOCKWISE) { |
| uint32_t tmp = front_a; |
| |
| front_a = back_a; |
| back_a = tmp; |
| |
| tmp = front_b; |
| front_b = back_b; |
| back_b = tmp; |
| } |
| |
| /* HW defaults to stencil off. */ |
| if (back_b != ispb_stencil_off) { |
| header->pres_ispctl_fb = true; |
| header->pres_ispctl_bb = true; |
| } |
| } |
| } |
| |
| if (!disable_stencil_test && front_b != ispb_stencil_off) |
| header->pres_ispctl_fb = true; |
| |
| pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) { |
| ispctl.upass = pass_info->isp_userpass; |
| |
| /* TODO: is bo ever NULL? Figure out what to do. */ |
| ispctl.tagwritedisable = raster_discard_enabled || |
| !gfx_pipeline->fragment_shader_state.bo; |
| |
| ispctl.two_sided = is_two_sided; |
| ispctl.bpres = header->pres_ispctl_fb || header->pres_ispctl_bb; |
| |
| ispctl.dbenable = !raster_discard_enabled && |
| gfx_pipeline->raster_state.depth_bias_enable && |
| obj_type == PVRX(TA_OBJTYPE_TRIANGLE); |
| ispctl.scenable = !raster_discard_enabled; |
| |
| ppp_state->isp.control_struct = ispctl; |
| } |
| |
| header->pres_ispctl = true; |
| |
| ppp_state->isp.control = isp_control; |
| ppp_state->isp.front_a = front_a; |
| ppp_state->isp.front_b = front_b; |
| ppp_state->isp.back_a = back_a; |
| ppp_state->isp.back_b = back_b; |
| } |
| |
| static float |
| pvr_calculate_final_depth_bias_contant_factor(struct pvr_device_info *dev_info, |
| VkFormat format, |
| float depth_bias) |
| { |
| /* Information for future modifiers of these depth bias calculations. |
| * ================================================================== |
| * Specified depth bias equations scale the specified constant factor by a |
| * value 'r' that is guaranteed to cause a resolvable difference in depth |
| * across the entire range of depth values. |
| * For floating point depth formats 'r' is calculated by taking the maximum |
| * exponent across the triangle. |
| * For UNORM formats 'r' is constant. |
| * Here 'n' is the number of mantissa bits stored in the floating point |
| * representation (23 for F32). |
| * |
| * UNORM Format -> z += dbcf * r + slope |
| * FLOAT Format -> z += dbcf * 2^(e-n) + slope |
| * |
| * HW Variations. |
| * ============== |
| * The HW either always performs the F32 depth bias equation (exponent based |
| * r), or in the case of HW that correctly supports the integer depth bias |
| * equation for UNORM depth formats, we can select between both equations |
| * using the ROGUE_CR_ISP_CTL.dbias_is_int flag - this is required to |
| * correctly perform Vulkan UNORM depth bias (constant r). |
| * |
| * if ern42307: |
| * if DBIAS_IS_INT_EN: |
| * z += dbcf + slope |
| * else: |
| * z += dbcf * 2^(e-n) + slope |
| * else: |
| * z += dbcf * 2^(e-n) + slope |
| * |
| */ |
| |
| float nudge_factor; |
| |
| if (PVR_HAS_ERN(dev_info, 42307)) { |
| switch (format) { |
| case VK_FORMAT_D16_UNORM: |
| return depth_bias / (1 << 15); |
| |
| case VK_FORMAT_D24_UNORM_S8_UINT: |
| case VK_FORMAT_X8_D24_UNORM_PACK32: |
| return depth_bias / (1 << 23); |
| |
| default: |
| return depth_bias; |
| } |
| } |
| |
| /* The reasoning behind clamping/nudging the value here is because UNORM |
| * depth formats can have higher precision over our underlying D32F |
| * representation for some depth ranges. |
| * |
| * When the HW scales the depth bias value by 2^(e-n) [The 'r' term'] a depth |
| * bias of 1 can result in a value smaller than one F32 ULP, which will get |
| * quantized to 0 - resulting in no bias. |
| * |
| * Biasing small values away from zero will ensure that small depth biases of |
| * 1 still yield a result and overcome Z-fighting. |
| */ |
| switch (format) { |
| case VK_FORMAT_D16_UNORM: |
| depth_bias *= 512.0f; |
| nudge_factor = 1.0f; |
| break; |
| |
| case VK_FORMAT_D24_UNORM_S8_UINT: |
| case VK_FORMAT_X8_D24_UNORM_PACK32: |
| depth_bias *= 2.0f; |
| nudge_factor = 2.0f; |
| break; |
| |
| default: |
| nudge_factor = 0.0f; |
| break; |
| } |
| |
| if (nudge_factor != 0.0f) { |
| if (depth_bias < 0.0f && depth_bias > -nudge_factor) |
| depth_bias -= nudge_factor; |
| else if (depth_bias > 0.0f && depth_bias < nudge_factor) |
| depth_bias += nudge_factor; |
| } |
| |
| return depth_bias; |
| } |
| |
| static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport, |
| const VkRect2D *const scissor, |
| VkRect2D *const rect_out) |
| { |
| /* TODO: See if we can remove this struct. */ |
| struct pvr_rect { |
| int32_t x0, y0; |
| int32_t x1, y1; |
| }; |
| |
| /* TODO: Worry about overflow? */ |
| const struct pvr_rect scissor_rect = { |
| .x0 = scissor->offset.x, |
| .y0 = scissor->offset.y, |
| .x1 = scissor->offset.x + scissor->extent.width, |
| .y1 = scissor->offset.y + scissor->extent.height |
| }; |
| struct pvr_rect viewport_rect = { 0 }; |
| |
| assert(viewport->width >= 0.0f); |
| assert(scissor_rect.x0 >= 0); |
| assert(scissor_rect.y0 >= 0); |
| |
| if (scissor->extent.width == 0 || scissor->extent.height == 0) { |
| *rect_out = (VkRect2D){ 0 }; |
| return; |
| } |
| |
| viewport_rect.x0 = (int32_t)viewport->x; |
| viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width; |
| |
| /* TODO: Is there a mathematical way of doing all this and then clamp at |
| * the end? |
| */ |
| /* We flip the y0 and y1 when height is negative. */ |
| viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height); |
| viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height); |
| |
| if (scissor_rect.x1 <= viewport_rect.x0 || |
| scissor_rect.y1 <= viewport_rect.y0 || |
| scissor_rect.x0 >= viewport_rect.x1 || |
| scissor_rect.y0 >= viewport_rect.y1) { |
| *rect_out = (VkRect2D){ 0 }; |
| return; |
| } |
| |
| /* Determine the overlapping rectangle. */ |
| viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0); |
| viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0); |
| viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1); |
| viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1); |
| |
| /* TODO: Is this conversion safe? Is this logic right? */ |
| rect_out->offset.x = (uint32_t)viewport_rect.x0; |
| rect_out->offset.y = (uint32_t)viewport_rect.y0; |
| rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0); |
| rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0); |
| } |
| |
| static inline uint32_t |
| pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info) |
| { |
| /* TODO: This should come from rogue_ppp.xml. */ |
| return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16)); |
| } |
| |
| static void |
| pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header; |
| struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; |
| const struct pvr_dynamic_state *const dynamic_state = |
| &cmd_buffer->state.dynamic.common; |
| const struct PVRX(TA_STATE_ISPCTL) *const ispctl = |
| &ppp_state->isp.control_struct; |
| struct pvr_device_info *const dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| |
| if (ispctl->dbenable && (cmd_buffer->state.dirty.depth_bias || |
| cmd_buffer->depth_bias_array.size == 0)) { |
| struct pvr_depth_bias_state depth_bias = dynamic_state->depth_bias; |
| |
| depth_bias.constant_factor = |
| pvr_calculate_final_depth_bias_contant_factor( |
| dev_info, |
| cmd_buffer->state.depth_format, |
| depth_bias.constant_factor); |
| |
| ppp_state->depthbias_scissor_indices.depthbias_index = |
| util_dynarray_num_elements(&cmd_buffer->depth_bias_array, |
| __typeof__(depth_bias)); |
| |
| util_dynarray_append(&cmd_buffer->depth_bias_array, |
| __typeof__(depth_bias), |
| depth_bias); |
| |
| header->pres_ispctl_dbsc = true; |
| } |
| |
| if (ispctl->scenable) { |
| const uint32_t region_clip_align_size = |
| pvr_get_geom_region_clip_align_size(dev_info); |
| const VkViewport *const viewport = &dynamic_state->viewport.viewports[0]; |
| const VkRect2D *const scissor = &dynamic_state->scissor.scissors[0]; |
| VkRect2D overlap_rect; |
| uint32_t scissor_words[2]; |
| uint32_t height; |
| uint32_t width; |
| uint32_t x; |
| uint32_t y; |
| |
| /* For region clip. */ |
| uint32_t bottom; |
| uint32_t right; |
| uint32_t left; |
| uint32_t top; |
| |
| /* We don't support multiple viewport calculations. */ |
| assert(dynamic_state->viewport.count == 1); |
| /* We don't support multiple scissor calculations. */ |
| assert(dynamic_state->scissor.count == 1); |
| |
| pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect); |
| |
| x = overlap_rect.offset.x; |
| y = overlap_rect.offset.y; |
| width = overlap_rect.extent.width; |
| height = overlap_rect.extent.height; |
| |
| pvr_csb_pack (&scissor_words[0], IPF_SCISSOR_WORD_0, word0) { |
| word0.scw0_xmax = x + width; |
| word0.scw0_xmin = x; |
| } |
| |
| pvr_csb_pack (&scissor_words[1], IPF_SCISSOR_WORD_1, word1) { |
| word1.scw1_ymax = y + height; |
| word1.scw1_ymin = y; |
| } |
| |
| if (cmd_buffer->scissor_array.size && |
| cmd_buffer->scissor_words[0] == scissor_words[0] && |
| cmd_buffer->scissor_words[1] == scissor_words[1]) { |
| return; |
| } |
| |
| cmd_buffer->scissor_words[0] = scissor_words[0]; |
| cmd_buffer->scissor_words[1] = scissor_words[1]; |
| |
| /* Calculate region clip. */ |
| |
| left = x / region_clip_align_size; |
| top = y / region_clip_align_size; |
| |
| /* We prevent right=-1 with the multiplication. */ |
| /* TODO: Is there a better way of doing this? */ |
| if ((x + width) != 0U) |
| right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1; |
| else |
| right = 0; |
| |
| if ((y + height) != 0U) |
| bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1; |
| else |
| bottom = 0U; |
| |
| /* Setup region clip to clip everything outside what was calculated. */ |
| |
| /* FIXME: Should we mask to prevent writing over other words? */ |
| pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) { |
| word0.right = right; |
| word0.left = left; |
| word0.mode = PVRX(TA_REGION_CLIP_MODE_OUTSIDE); |
| } |
| |
| pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) { |
| word1.bottom = bottom; |
| word1.top = top; |
| } |
| |
| ppp_state->depthbias_scissor_indices.scissor_index = |
| util_dynarray_num_elements(&cmd_buffer->scissor_array, |
| __typeof__(cmd_buffer->scissor_words)); |
| |
| memcpy(util_dynarray_grow_bytes(&cmd_buffer->scissor_array, |
| 1, |
| sizeof(cmd_buffer->scissor_words)), |
| cmd_buffer->scissor_words, |
| sizeof(cmd_buffer->scissor_words)); |
| |
| header->pres_ispctl_dbsc = true; |
| header->pres_region_clip = true; |
| } |
| } |
| |
| static void |
| pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer, |
| struct PVRX(TA_STATE_ISPA) * ispa) |
| { |
| struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header; |
| struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; |
| uint32_t merge_word; |
| uint32_t mask; |
| |
| pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) { |
| /* Disable for lines or punch-through or for DWD and depth compare |
| * always. |
| */ |
| if (ispa->objtype == PVRX(TA_OBJTYPE_LINE) || |
| ispa->passtype == PVRX(TA_PASSTYPE_PUNCH_THROUGH) || |
| (ispa->dwritedisable && ispa->dcmpmode == PVRX(TA_CMPMODE_ALWAYS))) { |
| size_info.pds_tri_merge_disable = true; |
| } |
| } |
| |
| pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) { |
| size_info.pds_tri_merge_disable = true; |
| } |
| |
| merge_word |= ppp_state->pds.size_info2 & ~mask; |
| |
| if (merge_word != ppp_state->pds.size_info2) { |
| ppp_state->pds.size_info2 = merge_word; |
| header->pres_pds_state_ptr0 = true; |
| } |
| } |
| |
| static void |
| pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state = |
| &state->gfx_pipeline->fragment_shader_state.descriptor_state; |
| const struct pvr_pds_upload *pds_coeff_program = |
| &state->gfx_pipeline->fragment_shader_state.pds_coeff_program; |
| const struct pvr_pipeline_stage_state *fragment_state = |
| &state->gfx_pipeline->fragment_shader_state.stage_state; |
| const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; |
| struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header; |
| struct pvr_ppp_state *const ppp_state = &state->ppp_state; |
| |
| const uint32_t pds_uniform_size = |
| DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords, |
| PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE)); |
| |
| const uint32_t pds_varying_state_size = |
| DIV_ROUND_UP(pds_coeff_program->data_size, |
| PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE)); |
| |
| const uint32_t usc_varying_size = |
| DIV_ROUND_UP(fragment_state->coefficient_size, |
| PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE)); |
| |
| const uint32_t pds_temp_size = |
| DIV_ROUND_UP(fragment_state->temps_count, |
| PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE)); |
| |
| const uint32_t usc_shared_size = |
| DIV_ROUND_UP(fragment_state->const_shared_reg_count, |
| PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE)); |
| |
| const uint32_t max_tiles_in_flight = |
| pvr_calc_fscommon_size_and_tiles_in_flight( |
| pdevice, |
| usc_shared_size * |
| PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE), |
| 1); |
| uint32_t size_info_mask; |
| uint32_t size_info2; |
| |
| if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight) |
| sub_cmd->max_tiles_in_flight = max_tiles_in_flight; |
| |
| pvr_csb_pack (&ppp_state->pds.pixel_shader_base, |
| TA_STATE_PDS_SHADERBASE, |
| shader_base) { |
| const struct pvr_pds_upload *const pds_upload = |
| &state->gfx_pipeline->fragment_shader_state.pds_fragment_program; |
| |
| shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset); |
| } |
| |
| if (descriptor_shader_state->pds_code.pvr_bo) { |
| pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base, |
| TA_STATE_PDS_TEXUNICODEBASE, |
| tex_base) { |
| tex_base.addr = |
| PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset); |
| } |
| } else { |
| ppp_state->pds.texture_uniform_code_base = 0U; |
| } |
| |
| pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) { |
| info1.pds_uniformsize = pds_uniform_size; |
| info1.pds_texturestatesize = 0U; |
| info1.pds_varyingsize = pds_varying_state_size; |
| info1.usc_varyingsize = usc_varying_size; |
| info1.pds_tempsize = pds_temp_size; |
| } |
| |
| pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) { |
| mask.pds_tri_merge_disable = true; |
| } |
| |
| ppp_state->pds.size_info2 &= size_info_mask; |
| |
| pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) { |
| info2.usc_sharedsize = usc_shared_size; |
| } |
| |
| ppp_state->pds.size_info2 |= size_info2; |
| |
| if (pds_coeff_program->pvr_bo) { |
| header->pres_pds_state_ptr1 = true; |
| |
| pvr_csb_pack (&ppp_state->pds.varying_base, |
| TA_STATE_PDS_VARYINGBASE, |
| base) { |
| base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset); |
| } |
| } else { |
| ppp_state->pds.varying_base = 0U; |
| } |
| |
| pvr_csb_pack (&ppp_state->pds.uniform_state_data_base, |
| TA_STATE_PDS_UNIFORMDATABASE, |
| base) { |
| base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset); |
| } |
| |
| header->pres_pds_state_ptr0 = true; |
| header->pres_pds_state_ptr3 = true; |
| } |
| |
| static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header; |
| struct pvr_ppp_state *const ppp_state = &state->ppp_state; |
| |
| if (ppp_state->viewport_count != state->dynamic.common.viewport.count) { |
| ppp_state->viewport_count = state->dynamic.common.viewport.count; |
| header->pres_viewport = true; |
| } |
| |
| if (state->gfx_pipeline->raster_state.discard_enable) { |
| /* We don't want to emit any viewport data as it'll just get thrown |
| * away. It's after the previous condition because we still want to |
| * stash the viewport_count as it's our trigger for when |
| * rasterizer discard gets disabled. |
| */ |
| header->pres_viewport = false; |
| return; |
| } |
| |
| for (uint32_t i = 0; i < ppp_state->viewport_count; i++) { |
| VkViewport *viewport = &state->dynamic.common.viewport.viewports[i]; |
| uint32_t x_scale = fui(viewport->width * 0.5f); |
| uint32_t y_scale = fui(viewport->height * 0.5f); |
| uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth); |
| uint32_t x_center = fui(viewport->x + viewport->width * 0.5f); |
| uint32_t y_center = fui(viewport->y + viewport->height * 0.5f); |
| uint32_t z_center = fui(viewport->minDepth); |
| |
| if (ppp_state->viewports[i].a0 != x_center || |
| ppp_state->viewports[i].m0 != x_scale || |
| ppp_state->viewports[i].a1 != y_center || |
| ppp_state->viewports[i].m1 != y_scale || |
| ppp_state->viewports[i].a2 != z_center || |
| ppp_state->viewports[i].m2 != z_scale) { |
| ppp_state->viewports[i].a0 = x_center; |
| ppp_state->viewports[i].m0 = x_scale; |
| ppp_state->viewports[i].a1 = y_center; |
| ppp_state->viewports[i].m1 = y_scale; |
| ppp_state->viewports[i].a2 = z_center; |
| ppp_state->viewports[i].m2 = z_scale; |
| |
| header->pres_viewport = true; |
| } |
| } |
| } |
| |
| static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer) |
| { |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline; |
| struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header; |
| struct pvr_ppp_state *const ppp_state = &state->ppp_state; |
| uint32_t ppp_control; |
| |
| pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) { |
| const struct pvr_raster_state *raster_state = &gfx_pipeline->raster_state; |
| VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology; |
| control.drawclippededges = true; |
| control.wclampen = true; |
| |
| if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN) |
| control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_1); |
| else |
| control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_0); |
| |
| if (raster_state->depth_clamp_enable) |
| control.clip_mode = PVRX(TA_CLIP_MODE_NO_FRONT_OR_REAR); |
| else |
| control.clip_mode = PVRX(TA_CLIP_MODE_FRONT_REAR); |
| |
| /* +--- FrontIsCCW? |
| * | +--- Cull Front? |
| * v v |
| * 0|0 CULLMODE_CULL_CCW, |
| * 0|1 CULLMODE_CULL_CW, |
| * 1|0 CULLMODE_CULL_CW, |
| * 1|1 CULLMODE_CULL_CCW, |
| */ |
| switch (raster_state->cull_mode) { |
| case VK_CULL_MODE_BACK_BIT: |
| case VK_CULL_MODE_FRONT_BIT: |
| if ((raster_state->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^ |
| (raster_state->cull_mode == VK_CULL_MODE_FRONT_BIT)) { |
| control.cullmode = PVRX(TA_CULLMODE_CULL_CW); |
| } else { |
| control.cullmode = PVRX(TA_CULLMODE_CULL_CCW); |
| } |
| |
| break; |
| |
| case VK_CULL_MODE_FRONT_AND_BACK: |
| case VK_CULL_MODE_NONE: |
| control.cullmode = PVRX(TA_CULLMODE_NO_CULLING); |
| break; |
| |
| default: |
| unreachable("Unsupported cull mode!"); |
| } |
| } |
| |
| if (ppp_control != ppp_state->ppp_control) { |
| ppp_state->ppp_control = ppp_control; |
| header->pres_ppp_ctrl = true; |
| } |
| } |
| |
| /* Largest valid PPP State update in words = 31 |
| * 1 - Header |
| * 3 - Stream Out Config words 0, 1 and 2 |
| * 1 - PPP Control word |
| * 3 - Varying Config words 0, 1 and 2 |
| * 1 - Output Select |
| * 1 - WClamp |
| * 6 - Viewport Transform words |
| * 2 - Region Clip words |
| * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3) |
| * 4 - PDS State for fragment phase (PDSSTATEPTR0) |
| * 6 - ISP Control Words |
| */ |
| #define PVR_MAX_PPP_STATE_DWORDS 31 |
| |
| static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| const bool deferred_secondary = pvr_cmd_uses_deferred_cs_cmds(cmd_buffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header; |
| struct pvr_csb *const control_stream = &sub_cmd->control_stream; |
| struct pvr_ppp_state *const ppp_state = &state->ppp_state; |
| uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS]; |
| const bool emit_dbsc = header->pres_ispctl_dbsc; |
| uint32_t *buffer_ptr = ppp_state_words; |
| uint32_t dbsc_patching_offset = 0; |
| uint32_t ppp_state_words_count; |
| struct pvr_bo *pvr_bo; |
| VkResult result; |
| |
| #if !defined(NDEBUG) |
| struct PVRX(TA_STATE_HEADER) emit_mask = *header; |
| uint32_t packed_emit_mask; |
| |
| static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1, |
| "EMIT_MASK_IS_CLEAR assumes 1 dword sized header."); |
| |
| # define EMIT_MASK_GET(field) (emit_mask.field) |
| # define EMIT_MASK_SET(field, value) (emit_mask.field = (value)) |
| # define EMIT_MASK_IS_CLEAR \ |
| (pvr_cmd_pack(TA_STATE_HEADER)(&packed_emit_mask, &emit_mask), \ |
| packed_emit_mask == 0) |
| #else |
| # define EMIT_MASK_GET(field) |
| # define EMIT_MASK_SET(field, value) |
| #endif |
| |
| header->view_port_count = |
| (ppp_state->viewport_count == 0) ? 0U : (ppp_state->viewport_count - 1); |
| header->pres_ispctl_fa = header->pres_ispctl; |
| |
| /* If deferred_secondary is true then we do a separate state update |
| * which gets patched in vkCmdExecuteCommands(). |
| */ |
| header->pres_ispctl_dbsc &= !deferred_secondary; |
| |
| pvr_csb_write_struct(buffer_ptr, TA_STATE_HEADER, header); |
| |
| static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1, |
| "Following header check assumes 1 dword sized header."); |
| /* If the header is empty we exit early and prevent a bo alloc of 0 size. */ |
| if (ppp_state_words[0] == 0) |
| return VK_SUCCESS; |
| |
| if (header->pres_ispctl) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPCTL, ppp_state->isp.control); |
| |
| assert(header->pres_ispctl_fa); |
| /* This is not a mistake. FA, BA have the ISPA format, and FB, BB have the |
| * ISPB format. |
| */ |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.front_a); |
| EMIT_MASK_SET(pres_ispctl_fa, false); |
| |
| if (header->pres_ispctl_fb) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.front_b); |
| EMIT_MASK_SET(pres_ispctl_fb, false); |
| } |
| |
| if (header->pres_ispctl_ba) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.back_a); |
| EMIT_MASK_SET(pres_ispctl_ba, false); |
| } |
| |
| if (header->pres_ispctl_bb) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.back_b); |
| EMIT_MASK_SET(pres_ispctl_bb, false); |
| } |
| |
| EMIT_MASK_SET(pres_ispctl, false); |
| } |
| |
| if (header->pres_ispctl_dbsc) { |
| assert(!deferred_secondary); |
| |
| dbsc_patching_offset = buffer_ptr - ppp_state_words; |
| |
| pvr_csb_pack (buffer_ptr, TA_STATE_ISPDBSC, ispdbsc) { |
| ispdbsc.dbindex = ppp_state->depthbias_scissor_indices.depthbias_index; |
| ispdbsc.scindex = ppp_state->depthbias_scissor_indices.scissor_index; |
| } |
| buffer_ptr += pvr_cmd_length(TA_STATE_ISPDBSC); |
| |
| EMIT_MASK_SET(pres_ispctl_dbsc, false); |
| } |
| |
| if (header->pres_pds_state_ptr0) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_SHADERBASE, |
| ppp_state->pds.pixel_shader_base); |
| |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_TEXUNICODEBASE, |
| ppp_state->pds.texture_uniform_code_base); |
| |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_SIZEINFO1, |
| ppp_state->pds.size_info1); |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_SIZEINFO2, |
| ppp_state->pds.size_info2); |
| |
| EMIT_MASK_SET(pres_pds_state_ptr0, false); |
| } |
| |
| if (header->pres_pds_state_ptr1) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_VARYINGBASE, |
| ppp_state->pds.varying_base); |
| EMIT_MASK_SET(pres_pds_state_ptr1, false); |
| } |
| |
| /* We don't use pds_state_ptr2 (texture state programs) control word, but |
| * this doesn't mean we need to set it to 0. This is because the hardware |
| * runs the texture state program only when |
| * ROGUE_TA_STATE_PDS_SIZEINFO1.pds_texturestatesize is non-zero. |
| */ |
| assert(pvr_csb_unpack(&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1) |
| .pds_texturestatesize == 0); |
| |
| if (header->pres_pds_state_ptr3) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PDS_UNIFORMDATABASE, |
| ppp_state->pds.uniform_state_data_base); |
| EMIT_MASK_SET(pres_pds_state_ptr3, false); |
| } |
| |
| if (header->pres_region_clip) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_REGION_CLIP0, |
| ppp_state->region_clipping.word0); |
| pvr_csb_write_value(buffer_ptr, |
| TA_REGION_CLIP1, |
| ppp_state->region_clipping.word1); |
| |
| EMIT_MASK_SET(pres_region_clip, false); |
| } |
| |
| if (header->pres_viewport) { |
| const uint32_t viewports = MAX2(1, ppp_state->viewport_count); |
| EMIT_MASK_SET(view_port_count, viewports); |
| |
| for (uint32_t i = 0; i < viewports; i++) { |
| /* These don't have any definitions in the csbgen xml files and none |
| * will be added. |
| */ |
| *buffer_ptr++ = ppp_state->viewports[i].a0; |
| *buffer_ptr++ = ppp_state->viewports[i].m0; |
| *buffer_ptr++ = ppp_state->viewports[i].a1; |
| *buffer_ptr++ = ppp_state->viewports[i].m1; |
| *buffer_ptr++ = ppp_state->viewports[i].a2; |
| *buffer_ptr++ = ppp_state->viewports[i].m2; |
| |
| EMIT_MASK_SET(view_port_count, EMIT_MASK_GET(view_port_count) - 1); |
| } |
| |
| EMIT_MASK_SET(pres_viewport, false); |
| } |
| |
| if (header->pres_wclamp) { |
| pvr_csb_pack (buffer_ptr, TA_WCLAMP, wclamp) { |
| wclamp.val = fui(0.00001f); |
| } |
| buffer_ptr += pvr_cmd_length(TA_WCLAMP); |
| EMIT_MASK_SET(pres_wclamp, false); |
| } |
| |
| if (header->pres_outselects) { |
| pvr_csb_write_value(buffer_ptr, TA_OUTPUT_SEL, ppp_state->output_selects); |
| EMIT_MASK_SET(pres_outselects, false); |
| } |
| |
| if (header->pres_varying_word0) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_VARYING0, |
| ppp_state->varying_word[0]); |
| EMIT_MASK_SET(pres_varying_word0, false); |
| } |
| |
| if (header->pres_varying_word1) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_VARYING1, |
| ppp_state->varying_word[1]); |
| EMIT_MASK_SET(pres_varying_word1, false); |
| } |
| |
| /* We only emit this on the first draw of a render job to prevent us from |
| * inheriting a non-zero value set elsewhere. |
| */ |
| if (header->pres_varying_word2) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_VARYING2, 0); |
| EMIT_MASK_SET(pres_varying_word2, false); |
| } |
| |
| if (header->pres_ppp_ctrl) { |
| pvr_csb_write_value(buffer_ptr, |
| TA_STATE_PPP_CTRL, |
| ppp_state->ppp_control); |
| EMIT_MASK_SET(pres_ppp_ctrl, false); |
| } |
| |
| /* We only emit this on the first draw of a render job to prevent us from |
| * inheriting a non-zero value set elsewhere. |
| */ |
| if (header->pres_stream_out_size) { |
| pvr_csb_write_value(buffer_ptr, TA_STATE_STREAM_OUT0, 0); |
| EMIT_MASK_SET(pres_stream_out_size, false); |
| } |
| |
| assert(EMIT_MASK_IS_CLEAR); |
| |
| #undef EMIT_MASK_GET |
| #undef EMIT_MASK_SET |
| #if !defined(NDEBUG) |
| # undef EMIT_MASK_IS_CLEAR |
| #endif |
| |
| ppp_state_words_count = buffer_ptr - ppp_state_words; |
| assert(ppp_state_words_count <= PVR_MAX_PPP_STATE_DWORDS); |
| |
| result = pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.general_heap, |
| ppp_state_words_count * sizeof(uint32_t), |
| PVR_BO_ALLOC_FLAG_CPU_MAPPED, |
| &pvr_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| memcpy(pvr_bo->bo->map, |
| ppp_state_words, |
| ppp_state_words_count * sizeof(uint32_t)); |
| |
| /* Write the VDM state update into the VDM control stream. */ |
| pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) { |
| state0.word_count = ppp_state_words_count; |
| state0.addrmsb = pvr_bo->vma->dev_addr; |
| } |
| |
| pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) { |
| state1.addrlsb = pvr_bo->vma->dev_addr; |
| } |
| |
| if (emit_dbsc && cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { |
| struct pvr_deferred_cs_command cmd; |
| |
| if (deferred_secondary) { |
| const uint32_t num_dwords = pvr_cmd_length(VDMCTRL_PPP_STATE0) + |
| pvr_cmd_length(VDMCTRL_PPP_STATE1); |
| |
| uint32_t *vdm_state = pvr_csb_alloc_dwords(control_stream, num_dwords); |
| if (!vdm_state) { |
| cmd_buffer->state.status = pvr_csb_get_status(control_stream); |
| return cmd_buffer->state.status; |
| } |
| |
| cmd = (struct pvr_deferred_cs_command){ |
| .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC, |
| .dbsc = { |
| .state = ppp_state->depthbias_scissor_indices, |
| .vdm_state = vdm_state, |
| }, |
| }; |
| } else { |
| cmd = (struct pvr_deferred_cs_command){ |
| .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2, |
| .dbsc2 = { |
| .state = ppp_state->depthbias_scissor_indices, |
| .ppp_cs_bo = pvr_bo, |
| .patch_offset = dbsc_patching_offset, |
| }, |
| }; |
| } |
| |
| util_dynarray_append(&cmd_buffer->deferred_csb_commands, |
| struct pvr_deferred_cs_command, |
| cmd); |
| } |
| |
| state->emit_header = (struct PVRX(TA_STATE_HEADER)){ 0 }; |
| |
| return VK_SUCCESS; |
| } |
| |
| static inline bool |
| pvr_ppp_state_update_required(const struct pvr_cmd_buffer_state *state) |
| { |
| const struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header; |
| return header->pres_ppp_ctrl || header->pres_ispctl || |
| header->pres_ispctl_fb || header->pres_ispctl_ba || |
| header->pres_ispctl_bb || header->pres_ispctl_dbsc || |
| header->pres_pds_state_ptr0 || header->pres_pds_state_ptr1 || |
| header->pres_pds_state_ptr2 || header->pres_pds_state_ptr3 || |
| header->pres_region_clip || header->pres_viewport || |
| header->pres_wclamp || header->pres_outselects || |
| header->pres_varying_word0 || header->pres_varying_word1 || |
| header->pres_varying_word2 || header->pres_stream_out_program; |
| } |
| |
| static VkResult |
| pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline; |
| const bool dirty_stencil = state->dirty.compare_mask || |
| state->dirty.write_mask || state->dirty.reference; |
| VkResult result; |
| |
| /* TODO: The emit_header will be dirty only if |
| * pvr_reset_graphics_dirty_state() was called before this (so when command |
| * buffer begins recording or when it's reset). Otherwise it will have been |
| * zeroed out by the previous pvr_emit_ppp_state(). We can probably set a |
| * flag in there and check it here instead of checking the header. |
| * Check if this is true and implement the flag. |
| */ |
| if (!(dirty_stencil || state->dirty.depth_bias || |
| state->dirty.fragment_descriptors || state->dirty.line_width || |
| state->dirty.gfx_pipeline_binding || state->dirty.scissor || |
| state->dirty.isp_userpass || state->dirty.viewport || |
| pvr_ppp_state_update_required(state))) { |
| return VK_SUCCESS; |
| } |
| |
| if (state->dirty.gfx_pipeline_binding) { |
| struct PVRX(TA_STATE_ISPA) ispa; |
| |
| pvr_setup_output_select(cmd_buffer); |
| pvr_setup_isp_faces_and_control(cmd_buffer, &ispa); |
| pvr_setup_triangle_merging_flag(cmd_buffer, &ispa); |
| } else if (dirty_stencil || state->dirty.line_width || |
| state->dirty.isp_userpass) { |
| pvr_setup_isp_faces_and_control(cmd_buffer, NULL); |
| } |
| |
| if (!gfx_pipeline->raster_state.discard_enable && |
| state->dirty.fragment_descriptors && |
| gfx_pipeline->fragment_shader_state.bo) { |
| pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd); |
| } |
| |
| pvr_setup_isp_depth_bias_scissor_state(cmd_buffer); |
| |
| if (state->dirty.viewport) |
| pvr_setup_viewport(cmd_buffer); |
| |
| pvr_setup_ppp_control(cmd_buffer); |
| |
| /* The hardware doesn't have an explicit mode for this so we use a |
| * negative viewport to make sure all objects are culled out early. |
| */ |
| if (gfx_pipeline->raster_state.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) { |
| /* Shift the viewport out of the guard-band culling everything. */ |
| const uint32_t negative_vp_val = fui(-2.0f); |
| |
| state->ppp_state.viewports[0].a0 = negative_vp_val; |
| state->ppp_state.viewports[0].m0 = 0; |
| state->ppp_state.viewports[0].a1 = negative_vp_val; |
| state->ppp_state.viewports[0].m1 = 0; |
| state->ppp_state.viewports[0].a2 = negative_vp_val; |
| state->ppp_state.viewports[0].m2 = 0; |
| |
| state->ppp_state.viewport_count = 1; |
| |
| state->emit_header.pres_viewport = true; |
| } |
| |
| result = pvr_emit_ppp_state(cmd_buffer, sub_cmd); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| return VK_SUCCESS; |
| } |
| |
| void pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info, |
| const uint32_t vs_output_size, |
| const bool raster_enable, |
| uint32_t *const cam_size_out, |
| uint32_t *const vs_max_instances_out) |
| { |
| /* First work out the size of a vertex in the UVS and multiply by 4 for |
| * column ordering. |
| */ |
| const uint32_t uvs_vertex_vector_size_in_dwords = |
| (vs_output_size + 1U + raster_enable * 4U) * 4U; |
| const uint32_t vdm_cam_size = |
| PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U); |
| |
| /* This is a proxy for 8XE. */ |
| if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) && |
| vdm_cam_size < 96U) { |
| /* Comparisons are based on size including scratch per vertex vector. */ |
| if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) { |
| *cam_size_out = MIN2(31U, vdm_cam_size - 1U); |
| *vs_max_instances_out = 16U; |
| } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) { |
| *cam_size_out = 15U; |
| *vs_max_instances_out = 16U; |
| } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) { |
| *cam_size_out = 11U; |
| *vs_max_instances_out = 12U; |
| } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) { |
| *cam_size_out = 7U; |
| *vs_max_instances_out = 8U; |
| } else if (PVR_HAS_FEATURE(dev_info, |
| simple_internal_parameter_format_v2) || |
| uvs_vertex_vector_size_in_dwords < (64U * 4U)) { |
| *cam_size_out = 7U; |
| *vs_max_instances_out = 4U; |
| } else { |
| *cam_size_out = 3U; |
| *vs_max_instances_out = 2U; |
| } |
| } else { |
| /* Comparisons are based on size including scratch per vertex vector. */ |
| if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) { |
| /* output size <= 27 + 5 scratch. */ |
| *cam_size_out = MIN2(95U, vdm_cam_size - 1U); |
| *vs_max_instances_out = 0U; |
| } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) { |
| /* output size <= 43 + 5 scratch */ |
| *cam_size_out = 63U; |
| if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U) |
| *vs_max_instances_out = 16U; |
| else |
| *vs_max_instances_out = 0U; |
| } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) { |
| /* output size <= 59 + 5 scratch. */ |
| *cam_size_out = 31U; |
| if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U) |
| *vs_max_instances_out = 16U; |
| else |
| *vs_max_instances_out = 0U; |
| } else { |
| *cam_size_out = 15U; |
| *vs_max_instances_out = 16U; |
| } |
| } |
| } |
| |
| static void |
| pvr_emit_dirty_vdm_state(const struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| /* FIXME: Assume all state is dirty for the moment. */ |
| struct pvr_device_info *const dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| ASSERTED const uint32_t max_user_vertex_output_components = |
| pvr_get_max_user_vertex_output_components(dev_info); |
| struct PVRX(VDMCTRL_VDM_STATE0) |
| header = { pvr_cmd_header(VDMCTRL_VDM_STATE0) }; |
| const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline; |
| struct pvr_csb *const csb = &sub_cmd->control_stream; |
| uint32_t vs_output_size; |
| uint32_t max_instances; |
| uint32_t cam_size; |
| |
| assert(gfx_pipeline); |
| |
| /* CAM Calculations and HW state take vertex size aligned to DWORDS. */ |
| vs_output_size = |
| DIV_ROUND_UP(gfx_pipeline->vertex_shader_state.vertex_output_size, |
| PVRX(VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE)); |
| |
| assert(vs_output_size <= max_user_vertex_output_components); |
| |
| pvr_calculate_vertex_cam_size(dev_info, |
| vs_output_size, |
| true, |
| &cam_size, |
| &max_instances); |
| |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) { |
| state0.cam_size = cam_size; |
| |
| if (gfx_pipeline->input_asm_state.primitive_restart) { |
| state0.cut_index_enable = true; |
| state0.cut_index_present = true; |
| } |
| |
| switch (gfx_pipeline->input_asm_state.topology) { |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: |
| state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_1); |
| break; |
| |
| default: |
| state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_0); |
| break; |
| } |
| |
| /* If we've bound a different vertex buffer, or this draw-call requires |
| * a different PDS attrib data-section from the last draw call (changed |
| * base_instance) then we need to specify a new data section. This is |
| * also the case if we've switched pipeline or attrib program as the |
| * data-section layout will be different. |
| */ |
| state0.vs_data_addr_present = |
| state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings || |
| state->dirty.draw_base_instance || state->dirty.draw_variant; |
| |
| /* Need to specify new PDS Attrib program if we've bound a different |
| * pipeline or we needed a different PDS Attrib variant for this |
| * draw-call. |
| */ |
| state0.vs_other_present = state->dirty.gfx_pipeline_binding || |
| state->dirty.draw_variant; |
| |
| /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when |
| * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because |
| * Vulkan doesn't support stream output and the vertex position is |
| * always emitted to the UVB. |
| */ |
| state0.uvs_scratch_size_select = |
| PVRX(VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE); |
| |
| header = state0; |
| } |
| |
| if (header.cut_index_present) { |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) { |
| switch (state->index_buffer_binding.type) { |
| case VK_INDEX_TYPE_UINT32: |
| /* FIXME: Defines for these? These seem to come from the Vulkan |
| * spec. for VkPipelineInputAssemblyStateCreateInfo |
| * primitiveRestartEnable. |
| */ |
| state1.cut_index = 0xFFFFFFFF; |
| break; |
| |
| case VK_INDEX_TYPE_UINT16: |
| state1.cut_index = 0xFFFF; |
| break; |
| |
| default: |
| unreachable(!"Invalid index type"); |
| } |
| } |
| } |
| |
| if (header.vs_data_addr_present) { |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) { |
| state2.vs_pds_data_base_addr = |
| PVR_DEV_ADDR(state->pds_vertex_attrib_offset); |
| } |
| } |
| |
| if (header.vs_other_present) { |
| const uint32_t usc_unified_store_size_in_bytes = |
| gfx_pipeline->vertex_shader_state.vertex_input_size << 2; |
| |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) { |
| state3.vs_pds_code_base_addr = |
| PVR_DEV_ADDR(state->pds_shader.code_offset); |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) { |
| state4.vs_output_size = vs_output_size; |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) { |
| state5.vs_max_instances = max_instances; |
| state5.vs_usc_common_size = 0U; |
| state5.vs_usc_unified_size = DIV_ROUND_UP( |
| usc_unified_store_size_in_bytes, |
| PVRX(VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE)); |
| state5.vs_pds_temp_size = |
| DIV_ROUND_UP(state->pds_shader.info->temps_required << 2, |
| PVRX(VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE)); |
| state5.vs_pds_data_size = |
| DIV_ROUND_UP(state->pds_shader.info->data_size_in_dwords << 2, |
| PVRX(VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE)); |
| } |
| } |
| } |
| |
| static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer) |
| { |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline; |
| const struct pvr_pipeline_layout *const pipeline_layout = |
| gfx_pipeline->base.layout; |
| const struct pvr_pipeline_stage_state *const fragment_state = |
| &gfx_pipeline->fragment_shader_state.stage_state; |
| struct pvr_sub_cmd_gfx *sub_cmd; |
| bool fstencil_writemask_zero; |
| bool bstencil_writemask_zero; |
| bool fstencil_keep; |
| bool bstencil_keep; |
| VkResult result; |
| |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| |
| sub_cmd = &state->current_sub_cmd->gfx; |
| sub_cmd->empty_cmd = false; |
| |
| /* Determine pipeline depth/stencil usage. If a pipeline uses depth or |
| * stencil testing, those attachments are using their loaded values, and |
| * the loadOps cannot be optimized out. |
| */ |
| /* Pipeline uses depth testing. */ |
| if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED && |
| gfx_pipeline->depth_compare_op != VK_COMPARE_OP_ALWAYS) { |
| sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED; |
| } |
| |
| /* Pipeline uses stencil testing. */ |
| if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED && |
| (gfx_pipeline->stencil_front.compare_op != VK_COMPARE_OP_ALWAYS || |
| gfx_pipeline->stencil_back.compare_op != VK_COMPARE_OP_ALWAYS)) { |
| sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED; |
| } |
| |
| if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info, |
| compute_overlap)) { |
| uint32_t coefficient_size = |
| DIV_ROUND_UP(fragment_state->coefficient_size, |
| PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE)); |
| |
| if (coefficient_size > |
| PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE)) |
| sub_cmd->disable_compute_overlap = true; |
| } |
| |
| sub_cmd->frag_uses_atomic_ops |= fragment_state->uses_atomic_ops; |
| sub_cmd->frag_has_side_effects |= fragment_state->has_side_effects; |
| sub_cmd->frag_uses_texture_rw |= fragment_state->uses_texture_rw; |
| sub_cmd->vertex_uses_texture_rw |= |
| gfx_pipeline->vertex_shader_state.stage_state.uses_texture_rw; |
| |
| fstencil_keep = |
| (gfx_pipeline->stencil_front.fail_op == VK_STENCIL_OP_KEEP) && |
| (gfx_pipeline->stencil_front.pass_op == VK_STENCIL_OP_KEEP); |
| bstencil_keep = (gfx_pipeline->stencil_back.fail_op == VK_STENCIL_OP_KEEP) && |
| (gfx_pipeline->stencil_back.pass_op == VK_STENCIL_OP_KEEP); |
| fstencil_writemask_zero = (state->dynamic.common.write_mask.front == 0); |
| bstencil_writemask_zero = (state->dynamic.common.write_mask.back == 0); |
| |
| /* Set stencil modified flag if: |
| * - Neither front nor back-facing stencil has a fail_op/pass_op of KEEP. |
| * - Neither front nor back-facing stencil has a write_mask of zero. |
| */ |
| if (!(fstencil_keep && bstencil_keep) && |
| !(fstencil_writemask_zero && bstencil_writemask_zero)) { |
| sub_cmd->modifies_stencil = true; |
| } |
| |
| /* Set depth modified flag if depth write is enabled. */ |
| if (!gfx_pipeline->depth_write_disable) |
| sub_cmd->modifies_depth = true; |
| |
| /* If either the data or code changes for pds vertex attribs, regenerate the |
| * data segment. |
| */ |
| if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding || |
| state->dirty.draw_variant || state->dirty.draw_base_instance) { |
| enum pvr_pds_vertex_attrib_program_type prog_type; |
| const struct pvr_pds_attrib_program *program; |
| |
| if (state->draw_state.draw_indirect) |
| prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT; |
| else if (state->draw_state.base_instance) |
| prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE; |
| else |
| prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC; |
| |
| program = |
| &gfx_pipeline->vertex_shader_state.pds_attrib_programs[prog_type]; |
| state->pds_shader.info = &program->info; |
| state->pds_shader.code_offset = program->program.code_offset; |
| |
| state->max_shared_regs = |
| MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline)); |
| |
| pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline); |
| } |
| |
| /* TODO: Check for dirty push constants */ |
| |
| state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding; |
| state->dirty.fragment_descriptors = state->dirty.vertex_descriptors; |
| |
| /* Account for dirty descriptor set. */ |
| state->dirty.vertex_descriptors |= |
| state->dirty.gfx_desc_dirty && |
| pipeline_layout |
| ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY]; |
| state->dirty.fragment_descriptors |= |
| state->dirty.gfx_desc_dirty && |
| pipeline_layout->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_FRAGMENT]; |
| |
| state->dirty.fragment_descriptors |= state->dirty.blend_constants; |
| |
| if (state->dirty.fragment_descriptors) { |
| result = pvr_setup_descriptor_mappings( |
| cmd_buffer, |
| PVR_STAGE_ALLOCATION_FRAGMENT, |
| &state->gfx_pipeline->fragment_shader_state.descriptor_state, |
| NULL, |
| &state->pds_fragment_descriptor_data_offset); |
| if (result != VK_SUCCESS) { |
| mesa_loge("Could not setup fragment descriptor mappings."); |
| return result; |
| } |
| } |
| |
| if (state->dirty.vertex_descriptors) { |
| uint32_t pds_vertex_descriptor_data_offset; |
| |
| result = pvr_setup_descriptor_mappings( |
| cmd_buffer, |
| PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY, |
| &state->gfx_pipeline->vertex_shader_state.descriptor_state, |
| NULL, |
| &pds_vertex_descriptor_data_offset); |
| if (result != VK_SUCCESS) { |
| mesa_loge("Could not setup vertex descriptor mappings."); |
| return result; |
| } |
| |
| pvr_emit_dirty_pds_state(cmd_buffer, |
| sub_cmd, |
| pds_vertex_descriptor_data_offset); |
| } |
| |
| pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd); |
| pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd); |
| |
| state->dirty.gfx_desc_dirty = false; |
| state->dirty.blend_constants = false; |
| state->dirty.compare_mask = false; |
| state->dirty.depth_bias = false; |
| state->dirty.draw_base_instance = false; |
| state->dirty.draw_variant = false; |
| state->dirty.fragment_descriptors = false; |
| state->dirty.line_width = false; |
| state->dirty.gfx_pipeline_binding = false; |
| state->dirty.reference = false; |
| state->dirty.scissor = false; |
| state->dirty.isp_userpass = false; |
| state->dirty.vertex_bindings = false; |
| state->dirty.viewport = false; |
| state->dirty.write_mask = false; |
| |
| return VK_SUCCESS; |
| } |
| |
| static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology) |
| { |
| switch (topology) { |
| case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST); |
| case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST); |
| case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP); |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST); |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP); |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN); |
| case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ); |
| case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ); |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ); |
| case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ); |
| case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: |
| return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST); |
| default: |
| unreachable("Undefined primitive topology"); |
| } |
| } |
| |
| /* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */ |
| /* Aligned to 128 bit for PDS loads / stores */ |
| #define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8 |
| |
| static VkResult |
| pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_csb *const csb, |
| pvr_dev_addr_t idx_buffer_addr, |
| uint32_t idx_stride, |
| struct PVRX(VDMCTRL_INDEX_LIST0) * list_hdr, |
| struct pvr_buffer *buffer, |
| VkDeviceSize offset, |
| uint32_t count, |
| uint32_t stride) |
| { |
| struct pvr_pds_drawindirect_program pds_prog = { 0 }; |
| uint32_t word0; |
| |
| /* Draw indirect always has index offset and instance count. */ |
| list_hdr->index_offset_present = true; |
| list_hdr->index_instance_count_present = true; |
| |
| pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr); |
| |
| pds_prog.support_base_instance = true; |
| pds_prog.arg_buffer = buffer->dev_addr.addr + offset; |
| pds_prog.index_buffer = idx_buffer_addr.addr; |
| pds_prog.index_block_header = word0; |
| pds_prog.index_stride = idx_stride; |
| pds_prog.num_views = 1U; |
| |
| /* TODO: See if we can pre-upload the code section of all the pds programs |
| * and reuse them here. |
| */ |
| /* Generate and upload the PDS programs (code + data). */ |
| for (uint32_t i = 0U; i < count; i++) { |
| const struct pvr_device_info *dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_bo *dummy_bo; |
| uint32_t *dummy_stream; |
| struct pvr_bo *pds_bo; |
| uint32_t *pds_base; |
| uint32_t pds_size; |
| VkResult result; |
| |
| pds_prog.increment_draw_id = (i != 0); |
| |
| if (state->draw_state.draw_indexed) { |
| pvr_pds_generate_draw_elements_indirect(&pds_prog, |
| 0, |
| PDS_GENERATE_SIZES, |
| dev_info); |
| } else { |
| pvr_pds_generate_draw_arrays_indirect(&pds_prog, |
| 0, |
| PDS_GENERATE_SIZES, |
| dev_info); |
| } |
| |
| pds_size = (pds_prog.program.data_size_aligned + |
| pds_prog.program.code_size_aligned) |
| << 2; |
| |
| result = pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.pds_heap, |
| pds_size, |
| PVR_BO_ALLOC_FLAG_CPU_MAPPED, |
| &pds_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pds_base = pds_bo->bo->map; |
| memcpy(pds_base, |
| pds_prog.program.code, |
| pds_prog.program.code_size_aligned << 2); |
| |
| if (state->draw_state.draw_indexed) { |
| pvr_pds_generate_draw_elements_indirect( |
| &pds_prog, |
| pds_base + pds_prog.program.code_size_aligned, |
| PDS_GENERATE_DATA_SEGMENT, |
| dev_info); |
| } else { |
| pvr_pds_generate_draw_arrays_indirect( |
| &pds_prog, |
| pds_base + pds_prog.program.code_size_aligned, |
| PDS_GENERATE_DATA_SEGMENT, |
| dev_info); |
| } |
| |
| pvr_bo_cpu_unmap(cmd_buffer->device, pds_bo); |
| |
| /* Write the VDM state update. */ |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) { |
| state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ANY); |
| |
| state0.pds_temp_size = |
| DIV_ROUND_UP(pds_prog.program.temp_size_aligned << 2, |
| PVRX(VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE)); |
| |
| state0.pds_data_size = |
| DIV_ROUND_UP(pds_prog.program.data_size_aligned << 2, |
| PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE)); |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) { |
| const uint32_t data_offset = |
| pds_bo->vma->dev_addr.addr + (pds_prog.program.code_size << 2) - |
| cmd_buffer->device->heaps.pds_heap->base_addr.addr; |
| |
| state1.pds_data_addr = PVR_DEV_ADDR(data_offset); |
| state1.sd_type = PVRX(VDMCTRL_SD_TYPE_PDS); |
| state1.sd_next_type = PVRX(VDMCTRL_SD_TYPE_NONE); |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) { |
| const uint32_t code_offset = |
| pds_bo->vma->dev_addr.addr - |
| cmd_buffer->device->heaps.pds_heap->base_addr.addr; |
| |
| state2.pds_code_addr = PVR_DEV_ADDR(code_offset); |
| } |
| |
| /* Sync task to ensure the VDM doesn't start reading the dummy blocks |
| * before they are ready. |
| */ |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) { |
| list0.primitive_topology = PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST); |
| } |
| |
| result = pvr_cmd_buffer_alloc_mem(cmd_buffer, |
| cmd_buffer->device->heaps.general_heap, |
| DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE, |
| PVR_BO_ALLOC_FLAG_CPU_MAPPED, |
| &dummy_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| dummy_stream = dummy_bo->bo->map; |
| |
| /* For indexed draw cmds fill in the dummy's header (as it won't change |
| * based on the indirect args) and increment by the in-use size of each |
| * dummy block. |
| */ |
| if (!state->draw_state.draw_indexed) { |
| dummy_stream[0] = word0; |
| dummy_stream += 4; |
| } else { |
| dummy_stream += 5; |
| } |
| |
| /* clang-format off */ |
| pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word); |
| /* clang-format on */ |
| |
| pvr_bo_cpu_unmap(cmd_buffer->device, dummy_bo); |
| |
| /* Stream link to the first dummy which forces the VDM to discard any |
| * prefetched (dummy) control stream. |
| */ |
| pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) { |
| link.with_return = true; |
| link.link_addrmsb = dummy_bo->vma->dev_addr; |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) { |
| link.link_addrlsb = dummy_bo->vma->dev_addr; |
| } |
| |
| /* Point the pds program to the next argument buffer and the next VDM |
| * dummy buffer. |
| */ |
| pds_prog.arg_buffer += stride; |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| #undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE |
| |
| static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd, |
| VkPrimitiveTopology topology, |
| uint32_t first_vertex, |
| uint32_t vertex_count, |
| uint32_t first_index, |
| uint32_t index_count, |
| uint32_t instance_count, |
| struct pvr_buffer *buffer, |
| VkDeviceSize offset, |
| uint32_t count, |
| uint32_t stride) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| const bool vertex_shader_has_side_effects = |
| state->gfx_pipeline->vertex_shader_state.stage_state.has_side_effects; |
| struct PVRX(VDMCTRL_INDEX_LIST0) |
| list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) }; |
| pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID; |
| struct pvr_csb *const csb = &sub_cmd->control_stream; |
| unsigned int index_stride = 0; |
| |
| list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology); |
| |
| /* firstInstance is not handled here in the VDM state, it's implemented as |
| * an addition in the PDS vertex fetch using |
| * PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type. |
| */ |
| |
| list_hdr.index_count_present = true; |
| |
| if (instance_count > 1) |
| list_hdr.index_instance_count_present = true; |
| |
| if (first_vertex != 0) |
| list_hdr.index_offset_present = true; |
| |
| if (state->draw_state.draw_indexed) { |
| struct pvr_buffer *buffer = state->index_buffer_binding.buffer; |
| |
| switch (state->index_buffer_binding.type) { |
| case VK_INDEX_TYPE_UINT32: |
| list_hdr.index_size = PVRX(VDMCTRL_INDEX_SIZE_B32); |
| index_stride = 4; |
| break; |
| |
| case VK_INDEX_TYPE_UINT16: |
| list_hdr.index_size = PVRX(VDMCTRL_INDEX_SIZE_B16); |
| index_stride = 2; |
| break; |
| |
| default: |
| unreachable("Invalid index type"); |
| } |
| |
| index_buffer_addr = PVR_DEV_ADDR_OFFSET( |
| buffer->dev_addr, |
| state->index_buffer_binding.offset + first_index * index_stride); |
| |
| list_hdr.index_addr_present = true; |
| |
| /* For indirect draw calls, index buffer address is not embedded into VDM |
| * control stream. |
| */ |
| if (!state->draw_state.draw_indirect) |
| list_hdr.index_base_addrmsb = index_buffer_addr; |
| } |
| |
| list_hdr.degen_cull_enable = |
| PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info, |
| vdm_degenerate_culling) && |
| !vertex_shader_has_side_effects; |
| |
| if (state->draw_state.draw_indirect) { |
| assert(buffer); |
| pvr_write_draw_indirect_vdm_stream(cmd_buffer, |
| csb, |
| index_buffer_addr, |
| index_stride, |
| &list_hdr, |
| buffer, |
| offset, |
| count, |
| stride); |
| return; |
| } |
| |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) { |
| list0 = list_hdr; |
| } |
| |
| if (list_hdr.index_addr_present) { |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) { |
| list1.index_base_addrlsb = index_buffer_addr; |
| } |
| } |
| |
| if (list_hdr.index_count_present) { |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) { |
| list2.index_count = vertex_count | index_count; |
| } |
| } |
| |
| if (list_hdr.index_instance_count_present) { |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) { |
| list3.instance_count = instance_count - 1; |
| } |
| } |
| |
| if (list_hdr.index_offset_present) { |
| pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) { |
| list4.index_offset = first_vertex; |
| } |
| } |
| |
| /* TODO: See if we need list_words[5-9]. */ |
| } |
| |
| void pvr_CmdDraw(VkCommandBuffer commandBuffer, |
| uint32_t vertexCount, |
| uint32_t instanceCount, |
| uint32_t firstVertex, |
| uint32_t firstInstance) |
| { |
| const struct pvr_cmd_buffer_draw_state draw_state = { |
| .base_vertex = firstVertex, |
| .base_instance = firstInstance, |
| }; |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_update_draw_state(state, &draw_state); |
| |
| result = pvr_validate_draw_state(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Write the VDM control stream for the primitive. */ |
| pvr_emit_vdm_index_list(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| state->gfx_pipeline->input_asm_state.topology, |
| firstVertex, |
| vertexCount, |
| 0U, |
| 0U, |
| instanceCount, |
| NULL, |
| 0U, |
| 0U, |
| 0U); |
| } |
| |
| void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer, |
| uint32_t indexCount, |
| uint32_t instanceCount, |
| uint32_t firstIndex, |
| int32_t vertexOffset, |
| uint32_t firstInstance) |
| { |
| const struct pvr_cmd_buffer_draw_state draw_state = { |
| .base_vertex = vertexOffset, |
| .base_instance = firstInstance, |
| .draw_indexed = true, |
| }; |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_update_draw_state(state, &draw_state); |
| |
| result = pvr_validate_draw_state(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Write the VDM control stream for the primitive. */ |
| pvr_emit_vdm_index_list(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| state->gfx_pipeline->input_asm_state.topology, |
| vertexOffset, |
| 0, |
| firstIndex, |
| indexCount, |
| instanceCount, |
| NULL, |
| 0U, |
| 0U, |
| 0U); |
| } |
| |
| void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, |
| VkBuffer _buffer, |
| VkDeviceSize offset, |
| uint32_t drawCount, |
| uint32_t stride) |
| { |
| const struct pvr_cmd_buffer_draw_state draw_state = { |
| .draw_indirect = true, |
| .draw_indexed = true, |
| }; |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer); |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_update_draw_state(state, &draw_state); |
| |
| result = pvr_validate_draw_state(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Write the VDM control stream for the primitive. */ |
| pvr_emit_vdm_index_list(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| state->gfx_pipeline->input_asm_state.topology, |
| 0, |
| 0, |
| 0, |
| 0, |
| 0, |
| buffer, |
| offset, |
| drawCount, |
| stride); |
| } |
| |
| void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer, |
| VkBuffer _buffer, |
| VkDeviceSize offset, |
| uint32_t drawCount, |
| uint32_t stride) |
| { |
| const struct pvr_cmd_buffer_draw_state draw_state = { |
| .draw_indirect = true, |
| }; |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer); |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| pvr_update_draw_state(state, &draw_state); |
| |
| result = pvr_validate_draw_state(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Write the VDM control stream for the primitive. */ |
| pvr_emit_vdm_index_list(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| state->gfx_pipeline->input_asm_state.topology, |
| 0, |
| 0, |
| 0, |
| 0, |
| 0, |
| buffer, |
| offset, |
| drawCount, |
| stride); |
| } |
| |
| static VkResult |
| pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_render_pass_info *info) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| const struct pvr_renderpass_hwsetup_render *hw_render = |
| &state->render_pass_info.pass->hw_setup->renders[info->current_hw_subpass]; |
| |
| for (uint32_t i = 0U; i < hw_render->eot_surface_count; i++) { |
| const struct pvr_renderpass_hwsetup_eot_surface *surface = |
| &hw_render->eot_surfaces[i]; |
| const uint32_t color_attach_idx = surface->src_attachment_idx; |
| const uint32_t resolve_attach_idx = surface->attachment_idx; |
| VkImageSubresourceLayers src_subresource; |
| VkImageSubresourceLayers dst_subresource; |
| struct pvr_image_view *dst_view; |
| struct pvr_image_view *src_view; |
| VkFormat src_format; |
| VkFormat dst_format; |
| VkImageCopy2 region; |
| VkResult result; |
| |
| if (!surface->need_resolve || |
| surface->resolve_type != PVR_RESOLVE_TYPE_TRANSFER) |
| continue; |
| |
| dst_view = info->attachments[resolve_attach_idx]; |
| src_view = info->attachments[color_attach_idx]; |
| |
| src_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; |
| src_subresource.mipLevel = src_view->vk.base_mip_level; |
| src_subresource.baseArrayLayer = src_view->vk.base_array_layer; |
| src_subresource.layerCount = src_view->vk.layer_count; |
| |
| dst_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; |
| dst_subresource.mipLevel = dst_view->vk.base_mip_level; |
| dst_subresource.baseArrayLayer = dst_view->vk.base_array_layer; |
| dst_subresource.layerCount = dst_view->vk.layer_count; |
| |
| region.srcOffset = (VkOffset3D){ info->render_area.offset.x, |
| info->render_area.offset.y, |
| 0 }; |
| region.dstOffset = (VkOffset3D){ info->render_area.offset.x, |
| info->render_area.offset.y, |
| 0 }; |
| region.extent = (VkExtent3D){ info->render_area.extent.width, |
| info->render_area.extent.height, |
| 1 }; |
| |
| region.srcSubresource = src_subresource; |
| region.dstSubresource = dst_subresource; |
| |
| /* TODO: if ERN_46863 is supported, Depth and stencil are sampled |
| * separately from images with combined depth+stencil. Add logic here to |
| * handle it using appropriate format from image view. |
| */ |
| src_format = src_view->vk.image->format; |
| dst_format = dst_view->vk.image->format; |
| src_view->vk.image->format = src_view->vk.format; |
| dst_view->vk.image->format = dst_view->vk.format; |
| |
| result = pvr_copy_or_resolve_color_image_region( |
| cmd_buffer, |
| vk_to_pvr_image(src_view->vk.image), |
| vk_to_pvr_image(dst_view->vk.image), |
| ®ion); |
| |
| src_view->vk.image->format = src_format; |
| dst_view->vk.image->format = dst_format; |
| |
| state->current_sub_cmd->flags |= |
| PVR_SUB_COMMAND_FLAG_WAIT_ON_PREVIOUS_FRAG; |
| |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| |
| return pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer, |
| const VkSubpassEndInfo *pSubpassEndInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_image_view **attachments; |
| VkClearValue *clear_values; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| assert(state->render_pass_info.pass); |
| assert(state->render_pass_info.framebuffer); |
| |
| /* TODO: Investigate why pvr_cmd_buffer_end_sub_cmd/EndSubCommand is called |
| * twice in this path, one here and one from |
| * pvr_resolve_unemitted_resolve_attachments. |
| */ |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, |
| &state->render_pass_info); |
| if (result != VK_SUCCESS) |
| return; |
| |
| /* Save the required fields before clearing render_pass_info struct. */ |
| attachments = state->render_pass_info.attachments; |
| clear_values = state->render_pass_info.clear_values; |
| |
| memset(&state->render_pass_info, 0, sizeof(state->render_pass_info)); |
| |
| state->render_pass_info.attachments = attachments; |
| state->render_pass_info.clear_values = clear_values; |
| } |
| |
| static VkResult |
| pvr_execute_deferred_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_cmd_buffer *sec_cmd_buffer) |
| { |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| const uint32_t prim_db_elems = |
| util_dynarray_num_elements(&cmd_buffer->depth_bias_array, |
| __typeof__(state->dynamic.common.depth_bias)); |
| const uint32_t prim_scissor_elems = |
| util_dynarray_num_elements(&cmd_buffer->scissor_array, |
| __typeof__(cmd_buffer->scissor_words)); |
| const uint32_t sec_db_size = |
| util_dynarray_num_elements(&sec_cmd_buffer->depth_bias_array, char); |
| const uint32_t sec_scissor_size = |
| util_dynarray_num_elements(&sec_cmd_buffer->scissor_array, char); |
| uint32_t *addr; |
| |
| util_dynarray_foreach (&sec_cmd_buffer->deferred_csb_commands, |
| struct pvr_deferred_cs_command, |
| cmd) { |
| switch (cmd->type) { |
| case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC: { |
| const uint32_t scissor_idx = |
| prim_scissor_elems + cmd->dbsc.state.scissor_index; |
| const uint32_t db_idx = |
| prim_db_elems + cmd->dbsc.state.depthbias_index; |
| const uint32_t num_dwords = |
| pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPDBSC); |
| uint32_t ppp_state[num_dwords]; |
| struct pvr_bo *pvr_bo; |
| VkResult result; |
| |
| pvr_csb_pack (&ppp_state[0], TA_STATE_HEADER, header) { |
| header.pres_ispctl_dbsc = true; |
| } |
| |
| pvr_csb_pack (&ppp_state[1], TA_STATE_ISPDBSC, ispdbsc) { |
| ispdbsc.dbindex = db_idx; |
| ispdbsc.scindex = scissor_idx; |
| } |
| |
| result = pvr_cmd_buffer_upload_general(cmd_buffer, |
| &ppp_state[0], |
| sizeof(ppp_state), |
| &pvr_bo); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| pvr_csb_pack (&cmd->dbsc.vdm_state[0], VDMCTRL_PPP_STATE0, state) { |
| state.word_count = num_dwords; |
| state.addrmsb = pvr_bo->vma->dev_addr; |
| } |
| |
| pvr_csb_pack (&cmd->dbsc.vdm_state[1], VDMCTRL_PPP_STATE1, state) { |
| state.addrlsb = pvr_bo->vma->dev_addr; |
| } |
| |
| break; |
| } |
| |
| case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2: { |
| const uint32_t scissor_idx = |
| prim_scissor_elems + cmd->dbsc2.state.scissor_index; |
| const uint32_t db_idx = |
| prim_db_elems + cmd->dbsc2.state.depthbias_index; |
| |
| assert(cmd->dbsc2.ppp_cs_bo->bo->map); |
| |
| addr = cmd->dbsc2.ppp_cs_bo->bo->map + cmd->dbsc2.patch_offset; |
| |
| pvr_csb_pack (addr, TA_STATE_ISPDBSC, ispdbsc) { |
| ispdbsc.dbindex = db_idx; |
| ispdbsc.scindex = scissor_idx; |
| } |
| |
| break; |
| } |
| |
| default: |
| unreachable("Invalid deferred control stream command type."); |
| break; |
| } |
| } |
| |
| addr = |
| util_dynarray_grow_bytes(&cmd_buffer->depth_bias_array, 1, sec_db_size); |
| memcpy(addr, |
| util_dynarray_begin(&sec_cmd_buffer->depth_bias_array), |
| sec_db_size); |
| |
| addr = |
| util_dynarray_grow_bytes(&cmd_buffer->scissor_array, 1, sec_scissor_size); |
| memcpy(addr, |
| util_dynarray_begin(&sec_cmd_buffer->scissor_array), |
| sec_scissor_size); |
| |
| cmd_buffer->state.dirty.depth_bias = true; |
| memset(&cmd_buffer->scissor_words[0], 0, sizeof(cmd_buffer->scissor_words)); |
| |
| return VK_SUCCESS; |
| } |
| |
| /* Caller needs to make sure that it ends the current sub_cmd. This function |
| * only creates a copy of sec_sub_cmd and links it to the cmd_buffer's |
| * sub_cmd list. |
| */ |
| static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer, |
| struct pvr_sub_cmd *sec_sub_cmd) |
| { |
| struct pvr_sub_cmd *primary_sub_cmd = |
| vk_zalloc(&cmd_buffer->vk.pool->alloc, |
| sizeof(*primary_sub_cmd), |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!primary_sub_cmd) { |
| cmd_buffer->state.status = |
| vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); |
| return cmd_buffer->state.status; |
| } |
| |
| primary_sub_cmd->type = sec_sub_cmd->type; |
| primary_sub_cmd->owned = false; |
| primary_sub_cmd->flags = sec_sub_cmd->flags; |
| |
| list_addtail(&primary_sub_cmd->link, &cmd_buffer->sub_cmds); |
| |
| switch (sec_sub_cmd->type) { |
| case PVR_SUB_CMD_TYPE_GRAPHICS: |
| primary_sub_cmd->gfx = sec_sub_cmd->gfx; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_COMPUTE: |
| primary_sub_cmd->compute = sec_sub_cmd->compute; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_TRANSFER: |
| primary_sub_cmd->transfer = sec_sub_cmd->transfer; |
| break; |
| |
| case PVR_SUB_CMD_TYPE_EVENT: |
| primary_sub_cmd->event = sec_sub_cmd->event; |
| break; |
| |
| default: |
| mesa_loge("Unsupported sub-command type %d", primary_sub_cmd->type); |
| break; |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static void |
| pvr_execute_graphics_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer, |
| const struct pvr_cmd_buffer *sec_cmd_buffer) |
| { |
| const struct pvr_device_info *dev_info = |
| &cmd_buffer->device->pdevice->dev_info; |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_sub_cmd *primary_sub_cmd = state->current_sub_cmd; |
| VkResult result; |
| |
| if (list_is_empty(&sec_cmd_buffer->sub_cmds)) |
| return; |
| |
| list_for_each_entry (struct pvr_sub_cmd, |
| sec_sub_cmd, |
| &sec_cmd_buffer->sub_cmds, |
| link) { |
| /* Only graphics secondary execution supported within a renderpass. */ |
| assert(sec_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS); |
| |
| if (!sec_sub_cmd->gfx.empty_cmd) |
| primary_sub_cmd->gfx.empty_cmd = false; |
| |
| if (pvr_cmd_uses_deferred_cs_cmds(sec_cmd_buffer)) { |
| /* TODO: In case if secondary buffer is created with |
| * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, then we patch the |
| * stream and copy it to primary stream using pvr_csb_copy below. |
| * This will need locking if the same secondary command buffer is |
| * executed in multiple primary buffers at the same time. |
| */ |
| result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| result = pvr_csb_copy(&primary_sub_cmd->gfx.control_stream, |
| &sec_sub_cmd->gfx.control_stream); |
| if (result != VK_SUCCESS) { |
| cmd_buffer->state.status = result; |
| return; |
| } |
| } else { |
| result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| pvr_csb_emit_link( |
| &primary_sub_cmd->gfx.control_stream, |
| pvr_csb_get_start_address(&sec_sub_cmd->gfx.control_stream), |
| true); |
| } |
| |
| if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info, |
| compute_overlap)) { |
| primary_sub_cmd->gfx.job.disable_compute_overlap |= |
| sec_sub_cmd->gfx.job.disable_compute_overlap; |
| } |
| |
| primary_sub_cmd->gfx.max_tiles_in_flight = |
| MIN2(primary_sub_cmd->gfx.max_tiles_in_flight, |
| sec_sub_cmd->gfx.max_tiles_in_flight); |
| |
| /* Pass loaded depth/stencil usage from secondary command buffer. */ |
| if (sec_sub_cmd->gfx.depth_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED) |
| primary_sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED; |
| |
| if (sec_sub_cmd->gfx.stencil_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED) |
| primary_sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED; |
| |
| /* Pass depth/stencil modification state from secondary command buffer. */ |
| if (sec_sub_cmd->gfx.modifies_depth) |
| primary_sub_cmd->gfx.modifies_depth = true; |
| |
| if (sec_sub_cmd->gfx.modifies_stencil) |
| primary_sub_cmd->gfx.modifies_stencil = true; |
| |
| pvr_finishme( |
| "Copy barrier related load/store information from secondary to primary command buffer."); |
| |
| if (!PVR_HAS_FEATURE(dev_info, gs_rta_support)) |
| pvr_finishme("Unimplemented path."); |
| } |
| } |
| |
| void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer, |
| uint32_t commandBufferCount, |
| const VkCommandBuffer *pCommandBuffers) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_cmd_buffer *last_cmd_buffer; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); |
| |
| /* Reset the CPU copy of the most recent PPP state of the primary command |
| * buffer. |
| * |
| * The next draw call in the primary after CmdExecuteCommands may send |
| * redundant state, if it all goes in the same geom job. |
| * |
| * Can't just copy state from the secondary because the recording state of |
| * the secondary command buffers would have been deleted at this point. |
| */ |
| pvr_reset_graphics_dirty_state(state, false); |
| |
| if (state->current_sub_cmd && |
| state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) { |
| for (uint32_t i = 0; i < commandBufferCount; i++) { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]); |
| |
| assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); |
| |
| pvr_execute_graphics_cmd_buffer(cmd_buffer, sec_cmd_buffer); |
| } |
| |
| last_cmd_buffer = |
| pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]); |
| |
| /* Set barriers from final command secondary command buffer. */ |
| for (uint32_t i = 0; i != PVR_NUM_SYNC_PIPELINE_STAGES; i++) { |
| state->barriers_needed[i] |= |
| last_cmd_buffer->state.barriers_needed[i] & |
| PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS; |
| } |
| } else { |
| for (uint32_t i = 0; i < commandBufferCount; i++) { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]); |
| VkResult result; |
| |
| assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); |
| |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| list_for_each_entry_safe (struct pvr_sub_cmd, |
| sec_sub_cmd, |
| &sec_cmd_buffer->sub_cmds, |
| link) { |
| result = pvr_execute_sub_cmd(cmd_buffer, sec_sub_cmd); |
| if (result != VK_SUCCESS) |
| return; |
| } |
| } |
| |
| last_cmd_buffer = |
| pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]); |
| |
| memcpy(state->barriers_needed, |
| last_cmd_buffer->state.barriers_needed, |
| sizeof(state->barriers_needed)); |
| } |
| } |
| |
| static void pvr_insert_transparent_obj(struct pvr_cmd_buffer *const cmd_buffer, |
| struct pvr_sub_cmd_gfx *const sub_cmd) |
| { |
| struct pvr_device *const device = cmd_buffer->device; |
| /* Yes we want a copy. The user could be recording multiple command buffers |
| * in parallel so writing the template in place could cause problems. |
| */ |
| struct pvr_static_clear_ppp_template clear = |
| device->static_clear_state.ppp_templates[PVR_STATIC_CLEAR_COLOR_BIT]; |
| uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT] = { 0 }; |
| struct pvr_csb *csb = &sub_cmd->control_stream; |
| struct pvr_bo *ppp_bo; |
| |
| assert(clear.requires_pds_state); |
| |
| /* Patch the template. */ |
| |
| pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE], |
| TA_STATE_PDS_SHADERBASE, |
| shaderbase) { |
| shaderbase.addr = PVR_DEV_ADDR(device->nop_program.pds.data_offset); |
| } |
| |
| clear.config.pds_state = &pds_state; |
| |
| clear.config.ispctl.upass = cmd_buffer->state.render_pass_info.isp_userpass; |
| |
| /* Emit PPP state from template. */ |
| |
| pvr_emit_ppp_from_template(csb, &clear, &ppp_bo); |
| list_add(&ppp_bo->link, &cmd_buffer->bo_list); |
| |
| /* Emit VDM state. */ |
| |
| static_assert(sizeof(device->static_clear_state.large_clear_vdm_words) >= |
| PVR_CLEAR_VDM_STATE_DWORD_COUNT * sizeof(uint32_t), |
| "Large clear VDM control stream word length mismatch"); |
| static_assert(sizeof(device->static_clear_state.vdm_words) == |
| PVR_CLEAR_VDM_STATE_DWORD_COUNT * sizeof(uint32_t), |
| "Clear VDM control stream word length mismatch"); |
| |
| pvr_emit_clear_words(cmd_buffer, sub_cmd); |
| |
| /* Reset graphics state. */ |
| pvr_reset_graphics_dirty_state(&cmd_buffer->state, false); |
| } |
| |
| static inline struct pvr_render_subpass * |
| pvr_get_current_subpass(const struct pvr_cmd_buffer_state *const state) |
| { |
| const uint32_t subpass_idx = state->render_pass_info.subpass_idx; |
| |
| return &state->render_pass_info.pass->subpasses[subpass_idx]; |
| } |
| |
| void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer, |
| const VkSubpassBeginInfo *pSubpassBeginInfo, |
| const VkSubpassEndInfo *pSubpassEndInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| struct pvr_render_pass_info *rp_info = &state->render_pass_info; |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass; |
| struct pvr_renderpass_hwsetup_render *next_hw_render; |
| const struct pvr_render_pass *pass = rp_info->pass; |
| const struct pvr_renderpass_hw_map *current_map; |
| const struct pvr_renderpass_hw_map *next_map; |
| struct pvr_load_op *hw_subpass_load_op; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| current_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx]; |
| next_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx + 1]; |
| next_hw_render = &pass->hw_setup->renders[next_map->render]; |
| |
| if (current_map->render != next_map->render) { |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return; |
| |
| result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, rp_info); |
| if (result != VK_SUCCESS) |
| return; |
| |
| rp_info->current_hw_subpass = next_map->render; |
| |
| result = |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| if (result != VK_SUCCESS) |
| return; |
| |
| rp_info->enable_bg_tag = false; |
| rp_info->process_empty_tiles = false; |
| |
| /* If this subpass contains any load ops the HW Background Object must be |
| * run to do the clears/loads. |
| */ |
| if (next_hw_render->color_init_count > 0) { |
| rp_info->enable_bg_tag = true; |
| |
| for (uint32_t i = 0; i < next_hw_render->color_init_count; i++) { |
| /* Empty tiles need to be cleared too. */ |
| if (next_hw_render->color_init[i].op == |
| VK_ATTACHMENT_LOAD_OP_CLEAR) { |
| rp_info->process_empty_tiles = true; |
| break; |
| } |
| } |
| } |
| |
| /* Set isp_userpass to zero for new hw_render. This will be used to set |
| * ROGUE_CR_ISP_CTL::upass_start. |
| */ |
| rp_info->isp_userpass = 0; |
| } |
| |
| hw_subpass = &next_hw_render->subpasses[next_map->subpass]; |
| hw_subpass_load_op = hw_subpass->load_op; |
| |
| if (hw_subpass_load_op) { |
| result = pvr_cs_write_load_op(cmd_buffer, |
| &state->current_sub_cmd->gfx, |
| hw_subpass_load_op, |
| rp_info->isp_userpass); |
| } |
| |
| /* Pipelines are created for a particular subpass so unbind but leave the |
| * vertex and descriptor bindings intact as they are orthogonal to the |
| * subpass. |
| */ |
| state->gfx_pipeline = NULL; |
| |
| /* User-pass spawn is 4 bits so if the driver has to wrap it, it will emit a |
| * full screen transparent object to flush all tags up until now, then the |
| * user-pass spawn value will implicitly be reset to 0 because |
| * pvr_render_subpass::isp_userpass values are stored ANDed with |
| * ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX. |
| */ |
| /* If hw_subpass_load_op is valid then pvr_write_load_op_control_stream |
| * has already done a full-screen transparent object. |
| */ |
| if (rp_info->isp_userpass == PVRX(CR_ISP_CTL_UPASS_START_SIZE_MAX) && |
| !hw_subpass_load_op) { |
| pvr_insert_transparent_obj(cmd_buffer, &state->current_sub_cmd->gfx); |
| } |
| |
| rp_info->subpass_idx++; |
| |
| rp_info->isp_userpass = pass->subpasses[rp_info->subpass_idx].isp_userpass; |
| state->dirty.isp_userpass = true; |
| |
| rp_info->pipeline_bind_point = |
| pass->subpasses[rp_info->subpass_idx].pipeline_bind_point; |
| |
| pvr_stash_depth_format(state, &state->current_sub_cmd->gfx); |
| } |
| |
| static bool |
| pvr_stencil_has_self_dependency(const struct pvr_cmd_buffer_state *const state) |
| { |
| const struct pvr_render_subpass *const current_subpass = |
| pvr_get_current_subpass(state); |
| const uint32_t *const input_attachments = current_subpass->input_attachments; |
| |
| /* We only need to check the current software subpass as we don't support |
| * merging to/from a subpass with self-dep stencil. |
| */ |
| |
| for (uint32_t i = 0; i < current_subpass->input_count; i++) { |
| if (input_attachments[i] == *current_subpass->depth_stencil_attachment) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static bool pvr_is_stencil_store_load_needed( |
| const struct pvr_cmd_buffer_state *const state, |
| VkPipelineStageFlags2 vk_src_stage_mask, |
| VkPipelineStageFlags2 vk_dst_stage_mask, |
| uint32_t memory_barrier_count, |
| const VkMemoryBarrier2 *const memory_barriers, |
| uint32_t image_barrier_count, |
| const VkImageMemoryBarrier2 *const image_barriers) |
| { |
| const uint32_t fragment_test_stages = |
| VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | |
| VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; |
| const struct pvr_render_pass *const pass = state->render_pass_info.pass; |
| const struct pvr_renderpass_hwsetup_render *hw_render; |
| struct pvr_image_view **const attachments = |
| state->render_pass_info.attachments; |
| const struct pvr_image_view *attachment; |
| uint32_t hw_render_idx; |
| |
| if (!pass) |
| return false; |
| |
| hw_render_idx = state->current_sub_cmd->gfx.hw_render_idx; |
| hw_render = &pass->hw_setup->renders[hw_render_idx]; |
| attachment = attachments[hw_render->ds_attach_idx]; |
| |
| if (!(vk_src_stage_mask & fragment_test_stages) && |
| vk_dst_stage_mask & VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT) |
| return false; |
| |
| if (hw_render->ds_attach_idx == VK_ATTACHMENT_UNUSED) |
| return false; |
| |
| for (uint32_t i = 0; i < memory_barrier_count; i++) { |
| const uint32_t stencil_write_bit = |
| VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; |
| const uint32_t input_attachment_read_bit = |
| VK_ACCESS_INPUT_ATTACHMENT_READ_BIT; |
| |
| if (!(memory_barriers[i].srcAccessMask & stencil_write_bit)) |
| continue; |
| |
| if (!(memory_barriers[i].dstAccessMask & input_attachment_read_bit)) |
| continue; |
| |
| return pvr_stencil_has_self_dependency(state); |
| } |
| |
| for (uint32_t i = 0; i < image_barrier_count; i++) { |
| PVR_FROM_HANDLE(pvr_image, image, image_barriers[i].image); |
| const uint32_t stencil_bit = VK_IMAGE_ASPECT_STENCIL_BIT; |
| |
| if (!(image_barriers[i].subresourceRange.aspectMask & stencil_bit)) |
| continue; |
| |
| if (attachment && image != vk_to_pvr_image(attachment->vk.image)) |
| continue; |
| |
| if (!vk_format_has_stencil(image->vk.format)) |
| continue; |
| |
| return pvr_stencil_has_self_dependency(state); |
| } |
| |
| return false; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_insert_mid_frag_barrier_event(struct pvr_cmd_buffer *cmd_buffer, |
| uint32_t src_stage_mask, |
| uint32_t dst_stage_mask) |
| { |
| struct pvr_sub_cmd_event *sub_cmd; |
| VkResult result; |
| |
| assert(cmd_buffer->state.current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS); |
| |
| cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false; |
| |
| pvr_finishme("Handle mid frag barrier stencil store."); |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| sub_cmd = &cmd_buffer->state.current_sub_cmd->event; |
| |
| sub_cmd->type = PVR_EVENT_TYPE_BARRIER; |
| sub_cmd->barrier.in_render_pass = true; |
| sub_cmd->barrier.wait_for_stage_mask = src_stage_mask; |
| sub_cmd->barrier.wait_at_stage_mask = dst_stage_mask; |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); |
| |
| pvr_finishme("Handle mid frag barrier color attachment load."); |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| pvr_cmd_buffer_insert_barrier_event(struct pvr_cmd_buffer *cmd_buffer, |
| uint32_t src_stage_mask, |
| uint32_t dst_stage_mask) |
| { |
| struct pvr_sub_cmd_event *sub_cmd; |
| VkResult result; |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| sub_cmd = &cmd_buffer->state.current_sub_cmd->event; |
| |
| sub_cmd->type = PVR_EVENT_TYPE_BARRIER; |
| sub_cmd->barrier.wait_for_stage_mask = src_stage_mask; |
| sub_cmd->barrier.wait_at_stage_mask = dst_stage_mask; |
| |
| return pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| /* This is just enough to handle vkCmdPipelineBarrier(). |
| * TODO: Complete? |
| */ |
| void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, |
| const VkDependencyInfo *pDependencyInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; |
| const struct pvr_render_pass *const render_pass = |
| state->render_pass_info.pass; |
| VkPipelineStageFlags vk_src_stage_mask = 0U; |
| VkPipelineStageFlags vk_dst_stage_mask = 0U; |
| bool is_stencil_store_load_needed; |
| uint32_t required_stage_mask = 0U; |
| uint32_t src_stage_mask; |
| uint32_t dst_stage_mask; |
| bool is_barrier_needed; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) { |
| vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; |
| vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask; |
| } |
| |
| for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) { |
| vk_src_stage_mask |= |
| pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; |
| vk_dst_stage_mask |= |
| pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask; |
| } |
| |
| for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) { |
| vk_src_stage_mask |= |
| pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; |
| vk_dst_stage_mask |= |
| pDependencyInfo->pImageMemoryBarriers[i].dstStageMask; |
| } |
| |
| src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask); |
| dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask); |
| |
| for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) { |
| if (!(dst_stage_mask & BITFIELD_BIT(stage))) |
| continue; |
| |
| required_stage_mask |= state->barriers_needed[stage]; |
| } |
| |
| src_stage_mask &= required_stage_mask; |
| for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) { |
| if (!(dst_stage_mask & BITFIELD_BIT(stage))) |
| continue; |
| |
| state->barriers_needed[stage] &= ~src_stage_mask; |
| } |
| |
| if (src_stage_mask == 0 || dst_stage_mask == 0) { |
| is_barrier_needed = false; |
| } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT && |
| dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) { |
| /* This is implicit so no need to barrier. */ |
| is_barrier_needed = false; |
| } else if (src_stage_mask == dst_stage_mask && |
| util_bitcount(src_stage_mask) == 1) { |
| struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd; |
| |
| switch (src_stage_mask) { |
| case PVR_PIPELINE_STAGE_FRAG_BIT: |
| is_barrier_needed = true; |
| |
| if (!render_pass) |
| break; |
| |
| assert(current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS); |
| |
| /* Flush all fragment work up to this point. */ |
| pvr_insert_transparent_obj(cmd_buffer, ¤t_sub_cmd->gfx); |
| break; |
| |
| case PVR_PIPELINE_STAGE_COMPUTE_BIT: |
| is_barrier_needed = false; |
| |
| if (!current_sub_cmd || |
| current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) { |
| break; |
| } |
| |
| /* Multiple dispatches can be merged into a single job. When back to |
| * back dispatches have a sequential dependency (CDM -> CDM pipeline |
| * barrier) we need to do the following. |
| * - Dispatch a kernel which fences all previous memory writes and |
| * flushes the MADD cache. |
| * - Issue a CDM fence which ensures all previous tasks emitted by |
| * the CDM are completed before starting anything new. |
| */ |
| |
| /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait |
| * for data. |
| */ |
| pvr_compute_generate_idfwdf(cmd_buffer, ¤t_sub_cmd->compute); |
| |
| pvr_compute_generate_fence(cmd_buffer, |
| ¤t_sub_cmd->compute, |
| false); |
| break; |
| |
| default: |
| is_barrier_needed = false; |
| break; |
| }; |
| } else { |
| is_barrier_needed = true; |
| } |
| |
| is_stencil_store_load_needed = |
| pvr_is_stencil_store_load_needed(state, |
| vk_src_stage_mask, |
| vk_dst_stage_mask, |
| pDependencyInfo->memoryBarrierCount, |
| pDependencyInfo->pMemoryBarriers, |
| pDependencyInfo->imageMemoryBarrierCount, |
| pDependencyInfo->pImageMemoryBarriers); |
| |
| if (is_stencil_store_load_needed) { |
| VkResult result; |
| |
| result = pvr_cmd_buffer_insert_mid_frag_barrier_event(cmd_buffer, |
| src_stage_mask, |
| dst_stage_mask); |
| if (result != VK_SUCCESS) |
| mesa_loge("Failed to insert mid frag barrier event."); |
| } else { |
| if (is_barrier_needed) { |
| VkResult result; |
| |
| result = pvr_cmd_buffer_insert_barrier_event(cmd_buffer, |
| src_stage_mask, |
| dst_stage_mask); |
| if (result != VK_SUCCESS) |
| mesa_loge("Failed to insert pipeline barrier event."); |
| } |
| } |
| } |
| |
| void pvr_CmdResetEvent2(VkCommandBuffer commandBuffer, |
| VkEvent _event, |
| VkPipelineStageFlags2 stageMask) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| PVR_FROM_HANDLE(pvr_event, event, _event); |
| struct pvr_sub_cmd_event *sub_cmd; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) |
| return; |
| |
| sub_cmd = &cmd_buffer->state.current_sub_cmd->event; |
| |
| sub_cmd->type = PVR_EVENT_TYPE_RESET; |
| sub_cmd->reset.event = event; |
| sub_cmd->reset.wait_for_stage_mask = pvr_stage_mask_src(stageMask); |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| void pvr_CmdSetEvent2(VkCommandBuffer commandBuffer, |
| VkEvent _event, |
| const VkDependencyInfo *pDependencyInfo) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| PVR_FROM_HANDLE(pvr_event, event, _event); |
| VkPipelineStageFlags2 stage_mask = 0; |
| struct pvr_sub_cmd_event *sub_cmd; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) |
| return; |
| |
| for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) |
| stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; |
| |
| for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) |
| stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; |
| |
| for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) |
| stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; |
| |
| sub_cmd = &cmd_buffer->state.current_sub_cmd->event; |
| |
| sub_cmd->type = PVR_EVENT_TYPE_SET; |
| sub_cmd->set.event = event; |
| sub_cmd->set.wait_for_stage_mask = pvr_stage_mask_dst(stage_mask); |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| void pvr_CmdWaitEvents2(VkCommandBuffer commandBuffer, |
| uint32_t eventCount, |
| const VkEvent *pEvents, |
| const VkDependencyInfo *pDependencyInfos) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_sub_cmd_event *sub_cmd; |
| struct pvr_event **events_array; |
| uint32_t *stage_masks; |
| VkResult result; |
| |
| PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); |
| |
| VK_MULTIALLOC(ma); |
| vk_multialloc_add(&ma, &events_array, __typeof__(*events_array), eventCount); |
| vk_multialloc_add(&ma, &stage_masks, __typeof__(*stage_masks), eventCount); |
| |
| if (!vk_multialloc_alloc(&ma, |
| &cmd_buffer->vk.pool->alloc, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) { |
| cmd_buffer->state.status = |
| vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); |
| return; |
| } |
| |
| result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT); |
| if (result != VK_SUCCESS) { |
| vk_free(&cmd_buffer->vk.pool->alloc, events_array); |
| return; |
| } |
| |
| memcpy(events_array, pEvents, sizeof(*events_array) * eventCount); |
| |
| for (uint32_t i = 0; i < eventCount; i++) { |
| const VkDependencyInfo *info = &pDependencyInfos[i]; |
| VkPipelineStageFlags2 mask = 0; |
| |
| for (uint32_t j = 0; j < info->memoryBarrierCount; j++) |
| mask |= info->pMemoryBarriers[j].dstStageMask; |
| |
| for (uint32_t j = 0; j < info->bufferMemoryBarrierCount; j++) |
| mask |= info->pBufferMemoryBarriers[j].dstStageMask; |
| |
| for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++) |
| mask |= info->pImageMemoryBarriers[j].dstStageMask; |
| |
| stage_masks[i] = pvr_stage_mask_dst(mask); |
| } |
| |
| sub_cmd = &cmd_buffer->state.current_sub_cmd->event; |
| |
| sub_cmd->type = PVR_EVENT_TYPE_WAIT; |
| sub_cmd->wait.count = eventCount; |
| sub_cmd->wait.events = events_array; |
| sub_cmd->wait.wait_at_stage_masks = stage_masks; |
| |
| pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| } |
| |
| void pvr_CmdWriteTimestamp2KHR(VkCommandBuffer commandBuffer, |
| VkPipelineStageFlags2 stage, |
| VkQueryPool queryPool, |
| uint32_t query) |
| { |
| unreachable("Timestamp queries are not supported."); |
| } |
| |
| VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer) |
| { |
| PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); |
| struct pvr_cmd_buffer_state *state = &cmd_buffer->state; |
| VkResult result; |
| |
| /* From the Vulkan 1.0 spec: |
| * |
| * CommandBuffer must be in the recording state. |
| */ |
| assert(cmd_buffer->status == PVR_CMD_BUFFER_STATUS_RECORDING); |
| |
| if (state->status != VK_SUCCESS) |
| return state->status; |
| |
| result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| cmd_buffer->status = PVR_CMD_BUFFER_STATUS_EXECUTABLE; |
| |
| return VK_SUCCESS; |
| } |