| /* |
| * Copyright © 2022 Imagination Technologies Ltd. |
| * |
| * based in part on v3dv driver which is: |
| * Copyright © 2019 Raspberry Pi |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #include <assert.h> |
| #include <stdbool.h> |
| #include <stdint.h> |
| #include <string.h> |
| #include <vulkan/vulkan.h> |
| |
| #include "compiler/shader_enums.h" |
| #include "hwdef/rogue_hw_utils.h" |
| #include "nir/nir.h" |
| #include "pco/pco.h" |
| #include "pco/pco_data.h" |
| #include "pvr_bo.h" |
| #include "pvr_csb.h" |
| #include "pvr_csb_enum_helpers.h" |
| #include "pvr_hardcode.h" |
| #include "pvr_pds.h" |
| #include "pvr_private.h" |
| #include "pvr_robustness.h" |
| #include "pvr_shader.h" |
| #include "pvr_types.h" |
| #include "rogue/rogue.h" |
| #include "util/log.h" |
| #include "util/macros.h" |
| #include "util/ralloc.h" |
| #include "util/u_dynarray.h" |
| #include "util/u_math.h" |
| #include "vk_alloc.h" |
| #include "vk_format.h" |
| #include "vk_graphics_state.h" |
| #include "vk_log.h" |
| #include "vk_object.h" |
| #include "vk_pipeline_cache.h" |
| #include "vk_pipeline_layout.h" |
| #include "vk_render_pass.h" |
| #include "vk_util.h" |
| #include "vulkan/runtime/vk_pipeline.h" |
| |
| /***************************************************************************** |
| PDS functions |
| *****************************************************************************/ |
| |
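| /* Creates and uploads the PDS coefficient loading program that drives the |
| * fragment shader's iterators; the result is stored in |
| * fragment_state->pds_coeff_program. |
| */ |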
| /* If allocator == NULL, the internal one will be used. */ |
| static VkResult pvr_pds_coeff_program_create_and_upload( |
| struct pvr_device *device, |
| const VkAllocationCallbacks *allocator, |
| struct pvr_pds_coeff_loading_program *program, |
| struct pvr_fragment_shader_state *fragment_state) |
| { |
| uint32_t staging_buffer_size; |
| uint32_t *staging_buffer; |
| VkResult result; |
| |
| assert(program->num_fpu_iterators < PVR_MAXIMUM_ITERATIONS); |
| |
| /* Get the size of the program and then allocate that much memory. */ |
| pvr_pds_coefficient_loading(program, NULL, PDS_GENERATE_SIZES); |
| |
| if (!program->code_size) { |
| fragment_state->pds_coeff_program.pvr_bo = NULL; |
| fragment_state->pds_coeff_program.code_size = 0; |
| fragment_state->pds_coeff_program.data_size = 0; |
| fragment_state->stage_state.pds_temps_count = 0; |
| |
| return VK_SUCCESS; |
| } |
| |
| staging_buffer_size = |
| PVR_DW_TO_BYTES(program->code_size + program->data_size); |
| |
| staging_buffer = vk_alloc2(&device->vk.alloc, |
| allocator, |
| staging_buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!staging_buffer) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| /* Generate the program into the staging_buffer. */ |
| pvr_pds_coefficient_loading(program, |
| staging_buffer, |
| PDS_GENERATE_CODEDATA_SEGMENTS); |
| |
| /* FIXME: Figure out the define for alignment of 16. */ |
| result = pvr_gpu_upload_pds(device, |
| &staging_buffer[0], |
| program->data_size, |
| 16, |
| &staging_buffer[program->data_size], |
| program->code_size, |
| 16, |
| 16, |
| &fragment_state->pds_coeff_program); |
| if (result != VK_SUCCESS) { |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| return result; |
| } |
| |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| |
| fragment_state->stage_state.pds_temps_count = program->temps_used; |
| |
| return VK_SUCCESS; |
| } |
| |
| /* FIXME: move this elsewhere since it's also called in pvr_pass.c? */ |
| /* If allocator == NULL, the internal one will be used. */ |
| VkResult pvr_pds_fragment_program_create_and_upload( |
| struct pvr_device *device, |
| const VkAllocationCallbacks *allocator, |
| pco_shader *fs, |
| struct pvr_fragment_shader_state *fragment_state) |
| { |
| /* TODO: Remove the code below, revert the pvr_pds_setup_doutu() args and |
| * assert that fs isn't NULL instead; this is temporarily in place for the |
| * hardcoded load ops in pvr_pass.c:pvr_generate_load_op_shader(). |
| */ |
| unsigned temps = 0; |
| bool has_phase_rate_change = false; |
| unsigned entry_offset = 0; |
| |
| if (fs) { |
| pco_data *fs_data = pco_shader_data(fs); |
| temps = fs_data->common.temps; |
| has_phase_rate_change = fs_data->fs.uses.phase_change; |
| entry_offset = fs_data->common.entry_offset; |
| } |
| |
| struct pvr_pds_kickusc_program program = { 0 }; |
| uint32_t staging_buffer_size; |
| uint32_t *staging_buffer; |
| VkResult result; |
| |
| const pvr_dev_addr_t exec_addr = |
| PVR_DEV_ADDR_OFFSET(fragment_state->bo->dev_addr, |
| /* fs_data->common.entry_offset */ entry_offset); |
| |
| /* Note this is not strictly required to be done before calculating the |
| * staging_buffer_size in this particular case. It can also be done after |
| * allocating the buffer. The size from pvr_pds_kick_usc() is constant. |
| */ |
| pvr_pds_setup_doutu( |
| &program.usc_task_control, |
| exec_addr.addr, |
| /* fs_data->common.temps */ temps, |
| fragment_state->sample_rate, |
| /* fs_data->fs.uses.phase_change */ has_phase_rate_change); |
| |
| pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES); |
| |
| staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size); |
| |
| staging_buffer = vk_alloc2(&device->vk.alloc, |
| allocator, |
| staging_buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!staging_buffer) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| pvr_pds_kick_usc(&program, |
| staging_buffer, |
| 0, |
| false, |
| PDS_GENERATE_CODEDATA_SEGMENTS); |
| |
| /* FIXME: Figure out the define for alignment of 16. */ |
| result = pvr_gpu_upload_pds(device, |
| &staging_buffer[0], |
| program.data_size, |
| 16, |
| &staging_buffer[program.data_size], |
| program.code_size, |
| 16, |
| 16, |
| &fragment_state->pds_fragment_program); |
| if (result != VK_SUCCESS) { |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| return result; |
| } |
| |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| |
| return VK_SUCCESS; |
| } |
| |
| static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes( |
| const struct pvr_device_info *dev_info, |
| bool robust_buffer_access) |
| { |
| /* FIXME: Use more local variables to improve formatting. */ |
| |
| /* Maximum memory allocation needed for const map entries in |
| * pvr_pds_generate_vertex_primary_program(). |
| * When robustBufferAccess is disabled, it must be >= 410. |
| * When robustBufferAccess is enabled, it must be >= 570. |
| * |
| * 1. Size of entry for base instance |
| * (pvr_const_map_entry_base_instance) |
| * |
| * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * ( |
| * if (!robustBufferAccess) |
| * size of vertex attribute entry |
| * (pvr_const_map_entry_vertex_attribute_address) + |
| * else |
| * size of robust vertex attribute entry |
| * (pvr_const_map_entry_robust_vertex_attribute_address) + |
| * size of entry for max attribute index |
| * (pvr_const_map_entry_vertex_attribute_max_index) + |
| * fi |
| * size of Unified Store burst entry |
| * (pvr_const_map_entry_literal32) + |
| * size of entry for vertex stride |
| * (pvr_const_map_entry_literal32) + |
| * size of entries for DDMAD control word |
| * (num_ddmad_literals * pvr_const_map_entry_literal32)) |
| * |
| * 3. Size of entry for DOUTW vertex/instance control word |
| * (pvr_const_map_entry_literal32) |
| * |
| * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address) |
| */ |
| |
| const size_t attribute_size = |
| (!robust_buffer_access) |
| ? sizeof(struct pvr_const_map_entry_vertex_attribute_address) |
| : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) + |
| sizeof(struct pvr_const_map_entry_vertex_attribute_max_index); |
| |
| /* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word |
| * and is increased by one DWORD to contain the data for the DDMADT's |
| * out-of-bounds check. |
| */ |
| const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals = |
| 1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt); |
| |
| return (sizeof(struct pvr_const_map_entry_base_instance) + |
| PVR_MAX_VERTEX_INPUT_BINDINGS * |
| (attribute_size + |
| (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) * |
| sizeof(struct pvr_const_map_entry_literal32)) + |
| sizeof(struct pvr_const_map_entry_literal32) + |
| sizeof(struct pvr_const_map_entry_doutu_address)); |
| } |
| |
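| /* Generates and uploads a single variant of the vertex primary (attribute |
| * DMA) PDS program, along with its const map entries. |
| */ |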
| static VkResult pvr_pds_vertex_attrib_program_create_and_upload( |
| struct pvr_device *const device, |
| const VkAllocationCallbacks *const allocator, |
| struct pvr_pds_vertex_primary_program_input *const input, |
| struct pvr_pds_attrib_program *const program_out) |
| { |
| const size_t const_entries_size_in_bytes = |
| pvr_pds_get_max_vertex_program_const_map_size_in_bytes( |
| &device->pdevice->dev_info, |
| device->vk.enabled_features.robustBufferAccess); |
| struct pvr_pds_upload *const program = &program_out->program; |
| struct pvr_pds_info *const info = &program_out->info; |
| struct pvr_const_map_entry *new_entries; |
| ASSERTED uint32_t code_size_in_dwords; |
| size_t staging_buffer_size; |
| uint32_t *staging_buffer; |
| VkResult result; |
| |
| memset(info, 0, sizeof(*info)); |
| |
| info->entries = vk_alloc2(&device->vk.alloc, |
| allocator, |
| const_entries_size_in_bytes, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!info->entries) { |
| result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| goto err_out; |
| } |
| |
| info->entries_size_in_bytes = const_entries_size_in_bytes; |
| |
| pvr_pds_generate_vertex_primary_program( |
| input, |
| NULL, |
| info, |
| device->vk.enabled_features.robustBufferAccess, |
| &device->pdevice->dev_info); |
| |
| code_size_in_dwords = info->code_size_in_dwords; |
| staging_buffer_size = PVR_DW_TO_BYTES(info->code_size_in_dwords); |
| |
| staging_buffer = vk_alloc2(&device->vk.alloc, |
| allocator, |
| staging_buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!staging_buffer) { |
| result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| goto err_free_entries; |
| } |
| |
| /* This also fills in info->entries. */ |
| pvr_pds_generate_vertex_primary_program( |
| input, |
| staging_buffer, |
| info, |
| device->vk.enabled_features.robustBufferAccess, |
| &device->pdevice->dev_info); |
| |
| assert(info->code_size_in_dwords <= code_size_in_dwords); |
| |
| /* FIXME: Add a vk_realloc2() ? */ |
| new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator, |
| info->entries, |
| info->entries_written_size_in_bytes, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!new_entries) { |
| result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| goto err_free_staging_buffer; |
| } |
| |
| info->entries = new_entries; |
| info->entries_size_in_bytes = info->entries_written_size_in_bytes; |
| |
| /* FIXME: Figure out the define for alignment of 16. */ |
| result = pvr_gpu_upload_pds(device, |
| NULL, |
| 0, |
| 0, |
| staging_buffer, |
| info->code_size_in_dwords, |
| 16, |
| 16, |
| program); |
| if (result != VK_SUCCESS) |
| goto err_free_staging_buffer; |
| |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| |
| return VK_SUCCESS; |
| |
| err_free_staging_buffer: |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| |
| err_free_entries: |
| vk_free2(&device->vk.alloc, allocator, info->entries); |
| |
| err_out: |
| return result; |
| } |
| |
| static inline void pvr_pds_vertex_attrib_program_destroy( |
| struct pvr_device *const device, |
| const struct VkAllocationCallbacks *const allocator, |
| struct pvr_pds_attrib_program *const program) |
| { |
| pvr_bo_suballoc_free(program->program.pvr_bo); |
| vk_free2(&device->vk.alloc, allocator, program->info.entries); |
| } |
| |
| /* This is a const pointer to an array of pvr_pds_attrib_program structs. |
| * The array pointed to has PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT elements. |
| */ |
| typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr) |
| [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT]; |
| |
| /* Generates and uploads the PDS programs for DMAing vertex attributes into |
| * USC vertex inputs. This bakes the code segment and creates a template of |
| * the data segment for the command buffer to fill in. |
| */ |
| /* If allocator == NULL, the internal one will be used. |
| * |
| * programs_out_ptr is a pointer to the array where the outputs will be placed. |
| */ |
| static VkResult pvr_pds_vertex_attrib_programs_create_and_upload( |
| struct pvr_device *device, |
| const VkAllocationCallbacks *const allocator, |
| pco_data *shader_data, |
| const struct pvr_pds_vertex_dma |
| dma_descriptions[static const PVR_MAX_VERTEX_ATTRIB_DMAS], |
| uint32_t dma_count, |
| pvr_pds_attrib_programs_array_ptr programs_out_ptr) |
| { |
| struct pvr_pds_vertex_primary_program_input input = { |
| .dma_list = dma_descriptions, |
| .dma_count = dma_count, |
| }; |
| uint32_t usc_temp_count = shader_data->common.temps; |
| struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr; |
| VkResult result; |
| |
| pco_range *sys_vals = shader_data->common.sys_vals; |
| if (sys_vals[SYSTEM_VALUE_VERTEX_ID].count > 0) { |
| input.flags |= PVR_PDS_VERTEX_FLAGS_VERTEX_ID_REQUIRED; |
| input.vertex_id_register = sys_vals[SYSTEM_VALUE_VERTEX_ID].start; |
| } |
| |
| if (sys_vals[SYSTEM_VALUE_INSTANCE_ID].count > 0) { |
| input.flags |= PVR_PDS_VERTEX_FLAGS_INSTANCE_ID_REQUIRED; |
| input.instance_id_register = sys_vals[SYSTEM_VALUE_INSTANCE_ID].start; |
| } |
| |
| if (sys_vals[SYSTEM_VALUE_BASE_INSTANCE].count > 0) { |
| input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_REQUIRED; |
| input.base_instance_register = sys_vals[SYSTEM_VALUE_BASE_INSTANCE].start; |
| } |
| |
| if (sys_vals[SYSTEM_VALUE_BASE_VERTEX].count > 0) { |
| input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_VERTEX_REQUIRED; |
| input.base_vertex_register = sys_vals[SYSTEM_VALUE_BASE_VERTEX].start; |
| } |
| |
| if (sys_vals[SYSTEM_VALUE_DRAW_ID].count > 0) { |
| input.flags |= PVR_PDS_VERTEX_FLAGS_DRAW_INDEX_REQUIRED; |
| input.draw_index_register = sys_vals[SYSTEM_VALUE_DRAW_ID].start; |
| } |
| |
| pvr_pds_setup_doutu(&input.usc_task_control, |
| 0, |
| usc_temp_count, |
| ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE, |
| false); |
| |
| /* Note: programs_out_ptr is a pointer to an array so this is fine. See the |
| * typedef. |
| */ |
| for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) { |
| uint32_t extra_flags; |
| |
| switch (i) { |
| case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC: |
| extra_flags = 0; |
| break; |
| |
| case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE: |
| extra_flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT; |
| break; |
| |
| case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT: |
| extra_flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT; |
| break; |
| |
| default: |
| unreachable("Invalid vertex attrib program type."); |
| } |
| |
| input.flags |= extra_flags; |
| |
| result = |
| pvr_pds_vertex_attrib_program_create_and_upload(device, |
| allocator, |
| &input, |
| &programs_out[i]); |
| if (result != VK_SUCCESS) { |
| for (uint32_t j = 0; j < i; j++) { |
| pvr_pds_vertex_attrib_program_destroy(device, |
| allocator, |
| &programs_out[j]); |
| } |
| |
| return result; |
| } |
| |
| input.flags &= ~extra_flags; |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void) |
| { |
| /* Maximum memory allocation needed for const map entries in |
| * pvr_pds_generate_descriptor_upload_program(). |
| * It must be >= 688 bytes. This size is calculated as the sum of: |
| * |
| * 1. Max. number of descriptor sets (8) * ( |
| * size of descriptor entry |
| * (pvr_const_map_entry_descriptor_set) + |
| * size of Common Store burst entry |
| * (pvr_const_map_entry_literal32)) |
| * |
| * 2. Max. number of PDS program buffers (24) * ( |
| * size of the largest buffer structure |
| * (pvr_const_map_entry_constant_buffer) + |
| * size of Common Store burst entry |
| * (pvr_const_map_entry_literal32) |
| * |
| * 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address) |
| */ |
| |
| /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8, yet the comment above says |
| * it should be 8. |
| * Figure out a define for this, or is the comment wrong? |
| */ |
| return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) + |
| sizeof(struct pvr_const_map_entry_literal32)) + |
| PVR_PDS_MAX_BUFFERS * |
| (sizeof(struct pvr_const_map_entry_constant_buffer) + |
| sizeof(struct pvr_const_map_entry_literal32)) + |
| sizeof(struct pvr_const_map_entry_doutu_address)); |
| } |
| |
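| /* Creates and uploads the PDS program that DMAs the stage's used descriptor |
| * sets into the register ranges allocated for them. If allocator == NULL, |
| * the internal one will be used. |
| */ |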
| static VkResult pvr_pds_descriptor_program_create_and_upload( |
| struct pvr_device *const device, |
| const VkAllocationCallbacks *const allocator, |
| const struct vk_pipeline_layout *const layout, |
| gl_shader_stage stage, |
| pco_data *data, |
| struct pvr_stage_allocation_descriptor_state *const descriptor_state) |
| { |
| const size_t const_entries_size_in_bytes = |
| pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(); |
| struct pvr_pds_info *const pds_info = &descriptor_state->pds_info; |
| struct pvr_pds_descriptor_program_input program = { 0 }; |
| struct pvr_const_map_entry *new_entries; |
| ASSERTED uint32_t code_size_in_dwords; |
| uint32_t staging_buffer_size; |
| uint32_t *staging_buffer; |
| VkResult result; |
| |
| *pds_info = (struct pvr_pds_info){ 0 }; |
| |
| for (unsigned desc_set = 0; desc_set < layout->set_count; ++desc_set) { |
| const struct pvr_descriptor_set_layout *set_layout = |
| vk_to_pvr_descriptor_set_layout(layout->set_layouts[desc_set]); |
| |
| const pco_descriptor_set_data *desc_set_data = |
| &data->common.desc_sets[desc_set]; |
| const pco_range *desc_set_range = &desc_set_data->range; |
| |
| /* If the descriptor set isn't for this stage or is unused, skip it. */ |
| if (!(BITFIELD_BIT(stage) & set_layout->stage_flags)) { |
| assert(!desc_set_data->used); |
| continue; |
| } |
| |
| if (!desc_set_data->used) |
| continue; |
| |
| program.descriptor_sets[program.descriptor_set_count] = |
| (struct pvr_pds_descriptor_set){ |
| .descriptor_set = desc_set, |
| .size_in_dwords = desc_set_range->count, |
| .destination = desc_set_range->start, |
| }; |
| |
| program.descriptor_set_count++; |
| } |
| |
| pds_info->entries = vk_alloc2(&device->vk.alloc, |
| allocator, |
| const_entries_size_in_bytes, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!pds_info->entries) { |
| result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| goto err_free_static_consts; |
| } |
| |
| pds_info->entries_size_in_bytes = const_entries_size_in_bytes; |
| |
| pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info); |
| |
| code_size_in_dwords = pds_info->code_size_in_dwords; |
| staging_buffer_size = PVR_DW_TO_BYTES(pds_info->code_size_in_dwords); |
| |
| if (!staging_buffer_size) { |
| vk_free2(&device->vk.alloc, allocator, pds_info->entries); |
| |
| *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 }; |
| |
| return VK_SUCCESS; |
| } |
| |
| staging_buffer = vk_alloc2(&device->vk.alloc, |
| allocator, |
| staging_buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!staging_buffer) { |
| result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| goto err_free_entries; |
| } |
| |
| pvr_pds_generate_descriptor_upload_program(&program, |
| staging_buffer, |
| pds_info); |
| |
| assert(pds_info->code_size_in_dwords <= code_size_in_dwords); |
| |
| /* FIXME: use vk_realloc2() ? */ |
| new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator, |
| pds_info->entries, |
| pds_info->entries_written_size_in_bytes, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!new_entries) { |
| result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| goto err_free_staging_buffer; |
| } |
| |
| pds_info->entries = new_entries; |
| pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes; |
| |
| /* FIXME: Figure out the define for alignment of 16. */ |
| result = pvr_gpu_upload_pds(device, |
| NULL, |
| 0, |
| 0, |
| staging_buffer, |
| pds_info->code_size_in_dwords, |
| 16, |
| 16, |
| &descriptor_state->pds_code); |
| if (result != VK_SUCCESS) |
| goto err_free_staging_buffer; |
| |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| |
| return VK_SUCCESS; |
| |
| err_free_staging_buffer: |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| |
| err_free_entries: |
| vk_free2(&device->vk.alloc, allocator, pds_info->entries); |
| |
| err_free_static_consts: |
| pvr_bo_suballoc_free(descriptor_state->static_consts); |
| |
| return result; |
| } |
| |
| static void pvr_pds_descriptor_program_destroy( |
| struct pvr_device *const device, |
| const struct VkAllocationCallbacks *const allocator, |
| struct pvr_stage_allocation_descriptor_state *const descriptor_state) |
| { |
| if (!descriptor_state) |
| return; |
| |
| pvr_bo_suballoc_free(descriptor_state->pds_code.pvr_bo); |
| vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries); |
| pvr_bo_suballoc_free(descriptor_state->static_consts); |
| } |
| |
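| /* Fills in a pvr_pds_compute_shader_program from the given inputs and runs |
| * pvr_pds_compute_shader() with PDS_GENERATE_SIZES so the code and data |
| * segment sizes are known. |
| */ |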
| static void pvr_pds_compute_program_setup( |
| const struct pvr_device_info *dev_info, |
| const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], |
| const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], |
| uint32_t barrier_coefficient, |
| bool add_base_workgroup, |
| uint32_t usc_temps, |
| pvr_dev_addr_t usc_shader_dev_addr, |
| struct pvr_pds_compute_shader_program *const program) |
| { |
| pvr_pds_compute_shader_program_init(program); |
| program->local_input_regs[0] = local_input_regs[0]; |
| program->local_input_regs[1] = local_input_regs[1]; |
| program->local_input_regs[2] = local_input_regs[2]; |
| program->work_group_input_regs[0] = work_group_input_regs[0]; |
| program->work_group_input_regs[1] = work_group_input_regs[1]; |
| program->work_group_input_regs[2] = work_group_input_regs[2]; |
| program->barrier_coefficient = barrier_coefficient; |
| program->add_base_workgroup = add_base_workgroup; |
| program->flattened_work_groups = true; |
| program->kick_usc = true; |
| |
| STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) == |
| PVR_WORKGROUP_DIMENSIONS); |
| STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) == |
| PVR_WORKGROUP_DIMENSIONS); |
| STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) == |
| PVR_WORKGROUP_DIMENSIONS); |
| |
| pvr_pds_setup_doutu(&program->usc_task_control, |
| usc_shader_dev_addr.addr, |
| usc_temps, |
| ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE, |
| false); |
| |
| pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info); |
| } |
| |
| /* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged. |
| */ |
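| /* Generates the PDS compute program's code and data segments and uploads |
| * them, returning the upload and its pvr_pds_info. If allocator == NULL, the |
| * internal one will be used. |
| */ |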
| static VkResult pvr_pds_compute_program_create_and_upload( |
| struct pvr_device *const device, |
| const VkAllocationCallbacks *const allocator, |
| const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], |
| const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], |
| uint32_t barrier_coefficient, |
| uint32_t usc_temps, |
| pvr_dev_addr_t usc_shader_dev_addr, |
| struct pvr_pds_upload *const pds_upload_out, |
| struct pvr_pds_info *const pds_info_out) |
| { |
| struct pvr_device_info *dev_info = &device->pdevice->dev_info; |
| struct pvr_pds_compute_shader_program program; |
| uint32_t staging_buffer_size; |
| uint32_t *staging_buffer; |
| VkResult result; |
| |
| pvr_pds_compute_program_setup(dev_info, |
| local_input_regs, |
| work_group_input_regs, |
| barrier_coefficient, |
| false, |
| usc_temps, |
| usc_shader_dev_addr, |
| &program); |
| |
| /* FIXME: According to pvr_device_init_compute_pds_program() the code size |
| * is in bytes. Investigate this. |
| */ |
| staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size); |
| |
| staging_buffer = vk_alloc2(&device->vk.alloc, |
| allocator, |
| staging_buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); |
| if (!staging_buffer) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| /* FIXME: pvr_pds_compute_shader doesn't implement |
| * PDS_GENERATE_CODEDATA_SEGMENTS. |
| */ |
| pvr_pds_compute_shader(&program, |
| &staging_buffer[0], |
| PDS_GENERATE_CODE_SEGMENT, |
| dev_info); |
| |
| pvr_pds_compute_shader(&program, |
| &staging_buffer[program.code_size], |
| PDS_GENERATE_DATA_SEGMENT, |
| dev_info); |
| |
| /* FIXME: Figure out the define for alignment of 16. */ |
| result = pvr_gpu_upload_pds(device, |
| &staging_buffer[program.code_size], |
| program.data_size, |
| 16, |
| &staging_buffer[0], |
| program.code_size, |
| 16, |
| 16, |
| pds_upload_out); |
| if (result != VK_SUCCESS) { |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| return result; |
| } |
| |
| *pds_info_out = (struct pvr_pds_info){ |
| .temps_required = program.highest_temp, |
| .code_size_in_dwords = program.code_size, |
| .data_size_in_dwords = program.data_size, |
| }; |
| |
| vk_free2(&device->vk.alloc, allocator, staging_buffer); |
| |
| return VK_SUCCESS; |
| } |
| |
| static void pvr_pds_compute_program_destroy( |
| struct pvr_device *const device, |
| const struct VkAllocationCallbacks *const allocator, |
| struct pvr_pds_upload *const pds_program, |
| struct pvr_pds_info *const pds_info) |
| { |
| /* We don't allocate an entries buffer so we don't need to free it. */ |
| pvr_bo_suballoc_free(pds_program->pvr_bo); |
| } |
| |
| /* This only uploads the code segment. The data segment will need to be patched |
| * with the base workgroup before uploading. |
| */ |
| static VkResult pvr_pds_compute_base_workgroup_variant_program_init( |
| struct pvr_device *const device, |
| const VkAllocationCallbacks *const allocator, |
| const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], |
| const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], |
| uint32_t barrier_coefficient, |
| uint32_t usc_temps, |
| pvr_dev_addr_t usc_shader_dev_addr, |
| struct pvr_pds_base_workgroup_program *program_out) |
| { |
| struct pvr_device_info *dev_info = &device->pdevice->dev_info; |
| struct pvr_pds_compute_shader_program program; |
| uint32_t buffer_size; |
| uint32_t *buffer; |
| VkResult result; |
| |
| pvr_pds_compute_program_setup(dev_info, |
| local_input_regs, |
| work_group_input_regs, |
| barrier_coefficient, |
| true, |
| usc_temps, |
| usc_shader_dev_addr, |
| &program); |
| |
| /* FIXME: According to pvr_device_init_compute_pds_program() the code size |
| * is in bytes. Investigate this. |
| */ |
| buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size)); |
| |
| buffer = vk_alloc2(&device->vk.alloc, |
| allocator, |
| buffer_size, |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!buffer) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| pvr_pds_compute_shader(&program, |
| &buffer[0], |
| PDS_GENERATE_CODE_SEGMENT, |
| dev_info); |
| |
| /* FIXME: Figure out the define for alignment of 16. */ |
| result = pvr_gpu_upload_pds(device, |
| NULL, |
| 0, |
| 0, |
| buffer, |
| program.code_size, |
| 16, |
| 16, |
| &program_out->code_upload); |
| if (result != VK_SUCCESS) { |
| vk_free2(&device->vk.alloc, allocator, buffer); |
| return result; |
| } |
| |
| pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info); |
| |
| program_out->data_section = buffer; |
| |
| /* We'll need to patch the base workgroup in the PDS data section before |
| * dispatch so we save the offsets at which to patch. We only need to save |
| * the offset for the first workgroup id since the workgroup ids are stored |
| * contiguously in the data segment. |
| */ |
| program_out->base_workgroup_data_patching_offset = |
| program.base_workgroup_constant_offset_in_dwords[0]; |
| |
| program_out->info = (struct pvr_pds_info){ |
| .temps_required = program.highest_temp, |
| .code_size_in_dwords = program.code_size, |
| .data_size_in_dwords = program.data_size, |
| }; |
| |
| return VK_SUCCESS; |
| } |
| |
| static void pvr_pds_compute_base_workgroup_variant_program_finish( |
| struct pvr_device *device, |
| const VkAllocationCallbacks *const allocator, |
| struct pvr_pds_base_workgroup_program *const state) |
| { |
| pvr_bo_suballoc_free(state->code_upload.pvr_bo); |
| vk_free2(&device->vk.alloc, allocator, state->data_section); |
| } |
| |
| /****************************************************************************** |
| Generic pipeline functions |
| ******************************************************************************/ |
| |
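| /* Initializes the common pipeline base: the Vulkan object base, the pipeline |
| * type, and a reference to the pipeline layout. |
| */ |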
| static void pvr_pipeline_init(struct pvr_device *device, |
| enum pvr_pipeline_type type, |
| const VkPipelineLayout layout, |
| struct pvr_pipeline *const pipeline) |
| { |
| vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE); |
| |
| pipeline->type = type; |
| |
| assert(!pipeline->layout); |
| pipeline->layout = vk_pipeline_layout_from_handle(layout); |
| vk_pipeline_layout_ref(pipeline->layout); |
| } |
| |
| static void pvr_pipeline_finish(struct pvr_device *device, |
| struct pvr_pipeline *pipeline) |
| { |
| vk_pipeline_layout_unref(&device->vk, pipeline->layout); |
| vk_object_base_finish(&pipeline->base); |
| } |
| |
| /* How many shared regs it takes to store a pvr_dev_addr_t. |
| * Each shared reg is 32 bits. |
| */ |
| #define PVR_DEV_ADDR_SIZE_IN_SH_REGS \ |
| DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t)) |
| |
| /****************************************************************************** |
| Compute pipeline functions |
| ******************************************************************************/ |
| |
| /* Compiles and uploads shaders and PDS programs. */ |
| static VkResult pvr_compute_pipeline_compile( |
| struct pvr_device *const device, |
| struct vk_pipeline_cache *cache, |
| const VkComputePipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *const allocator, |
| struct pvr_compute_pipeline *const compute_pipeline) |
| { |
| struct vk_pipeline_layout *layout = compute_pipeline->base.layout; |
| uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS]; |
| uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS]; |
| uint32_t barrier_coefficient; |
| uint32_t usc_temps; |
| VkResult result; |
| |
| compute_pipeline->shader_state.const_shared_reg_count = 0; |
| |
| /* FIXME: Compile and upload the shader. */ |
| /* FIXME: Initialize the shader state and setup build info. */ |
| unreachable("finishme: compute support"); |
| |
| result = pvr_pds_descriptor_program_create_and_upload( |
| device, |
| allocator, |
| layout, |
| MESA_SHADER_COMPUTE, |
| NULL, |
| &compute_pipeline->descriptor_state); |
| if (result != VK_SUCCESS) |
| goto err_free_shader; |
| |
| result = pvr_pds_compute_program_create_and_upload( |
| device, |
| allocator, |
| local_input_regs, |
| work_group_input_regs, |
| barrier_coefficient, |
| usc_temps, |
| compute_pipeline->shader_state.bo->dev_addr, |
| &compute_pipeline->primary_program, |
| &compute_pipeline->primary_program_info); |
| if (result != VK_SUCCESS) |
| goto err_free_descriptor_program; |
| |
| /* If the workgroup ID is required, then we require the base workgroup |
| * variant of the PDS compute program as well. |
| */ |
| compute_pipeline->flags.base_workgroup = |
| work_group_input_regs[0] != PVR_PDS_REG_UNUSED || |
| work_group_input_regs[1] != PVR_PDS_REG_UNUSED || |
| work_group_input_regs[2] != PVR_PDS_REG_UNUSED; |
| |
| if (compute_pipeline->flags.base_workgroup) { |
| result = pvr_pds_compute_base_workgroup_variant_program_init( |
| device, |
| allocator, |
| local_input_regs, |
| work_group_input_regs, |
| barrier_coefficient, |
| usc_temps, |
| compute_pipeline->shader_state.bo->dev_addr, |
| &compute_pipeline->primary_base_workgroup_variant_program); |
| if (result != VK_SUCCESS) |
| goto err_destroy_compute_program; |
| } |
| |
| return VK_SUCCESS; |
| |
| err_destroy_compute_program: |
| pvr_pds_compute_program_destroy(device, |
| allocator, |
| &compute_pipeline->primary_program, |
| &compute_pipeline->primary_program_info); |
| |
| err_free_descriptor_program: |
| pvr_pds_descriptor_program_destroy(device, |
| allocator, |
| &compute_pipeline->descriptor_state); |
| |
| err_free_shader: |
| pvr_bo_suballoc_free(compute_pipeline->shader_state.bo); |
| |
| return result; |
| } |
| |
| static VkResult |
| pvr_compute_pipeline_init(struct pvr_device *device, |
| struct vk_pipeline_cache *cache, |
| const VkComputePipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *allocator, |
| struct pvr_compute_pipeline *compute_pipeline) |
| { |
| VkResult result; |
| |
| pvr_pipeline_init(device, |
| PVR_PIPELINE_TYPE_COMPUTE, |
| pCreateInfo->layout, |
| &compute_pipeline->base); |
| |
| result = pvr_compute_pipeline_compile(device, |
| cache, |
| pCreateInfo, |
| allocator, |
| compute_pipeline); |
| if (result != VK_SUCCESS) { |
| pvr_pipeline_finish(device, &compute_pipeline->base); |
| return result; |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| pvr_compute_pipeline_create(struct pvr_device *device, |
| struct vk_pipeline_cache *cache, |
| const VkComputePipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *allocator, |
| VkPipeline *const pipeline_out) |
| { |
| struct pvr_compute_pipeline *compute_pipeline; |
| VkResult result; |
| |
| compute_pipeline = vk_zalloc2(&device->vk.alloc, |
| allocator, |
| sizeof(*compute_pipeline), |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); |
| if (!compute_pipeline) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| /* Compiles and uploads shaders and PDS programs. */ |
| result = pvr_compute_pipeline_init(device, |
| cache, |
| pCreateInfo, |
| allocator, |
| compute_pipeline); |
| if (result != VK_SUCCESS) { |
| vk_free2(&device->vk.alloc, allocator, compute_pipeline); |
| return result; |
| } |
| |
| *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base); |
| |
| return VK_SUCCESS; |
| } |
| |
| static void pvr_compute_pipeline_destroy( |
| struct pvr_device *const device, |
| const VkAllocationCallbacks *const allocator, |
| struct pvr_compute_pipeline *const compute_pipeline) |
| { |
| if (compute_pipeline->flags.base_workgroup) { |
| pvr_pds_compute_base_workgroup_variant_program_finish( |
| device, |
| allocator, |
| &compute_pipeline->primary_base_workgroup_variant_program); |
| } |
| |
| pvr_pds_compute_program_destroy(device, |
| allocator, |
| &compute_pipeline->primary_program, |
| &compute_pipeline->primary_program_info); |
| pvr_pds_descriptor_program_destroy(device, |
| allocator, |
| &compute_pipeline->descriptor_state); |
| pvr_bo_suballoc_free(compute_pipeline->shader_state.bo); |
| |
| pvr_pipeline_finish(device, &compute_pipeline->base); |
| |
| vk_free2(&device->vk.alloc, allocator, compute_pipeline); |
| } |
| |
| VkResult |
| pvr_CreateComputePipelines(VkDevice _device, |
| VkPipelineCache pipelineCache, |
| uint32_t createInfoCount, |
| const VkComputePipelineCreateInfo *pCreateInfos, |
| const VkAllocationCallbacks *pAllocator, |
| VkPipeline *pPipelines) |
| { |
| VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache); |
| PVR_FROM_HANDLE(pvr_device, device, _device); |
| VkResult result = VK_SUCCESS; |
| |
| for (uint32_t i = 0; i < createInfoCount; i++) { |
| const VkResult local_result = |
| pvr_compute_pipeline_create(device, |
| cache, |
| &pCreateInfos[i], |
| pAllocator, |
| &pPipelines[i]); |
| if (local_result != VK_SUCCESS) { |
| result = local_result; |
| pPipelines[i] = VK_NULL_HANDLE; |
| } |
| } |
| |
| return result; |
| } |
| |
| /****************************************************************************** |
| Graphics pipeline functions |
| ******************************************************************************/ |
| |
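| /* Frees the per-descriptor-set binding data ralloc'ed into a shader's |
| * pco_data. |
| */ |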
| static void pvr_pipeline_destroy_shader_data(pco_data *data) |
| { |
| for (unsigned u = 0; u < ARRAY_SIZE(data->common.desc_sets); ++u) |
| if (data->common.desc_sets[u].bindings) |
| ralloc_free(data->common.desc_sets[u].bindings); |
| } |
| |
| static void |
| pvr_graphics_pipeline_destroy(struct pvr_device *const device, |
| const VkAllocationCallbacks *const allocator, |
| struct pvr_graphics_pipeline *const gfx_pipeline) |
| { |
| const uint32_t num_vertex_attrib_programs = |
| ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs); |
| |
| pvr_pds_descriptor_program_destroy( |
| device, |
| allocator, |
| &gfx_pipeline->shader_state.fragment.descriptor_state); |
| |
| pvr_pds_descriptor_program_destroy( |
| device, |
| allocator, |
| &gfx_pipeline->shader_state.vertex.descriptor_state); |
| |
| for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) { |
| struct pvr_pds_attrib_program *const attrib_program = |
| &gfx_pipeline->shader_state.vertex.pds_attrib_programs[i]; |
| |
| pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program); |
| } |
| |
| pvr_bo_suballoc_free( |
| gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo); |
| pvr_bo_suballoc_free( |
| gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo); |
| |
| pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo); |
| pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo); |
| |
| pvr_pipeline_finish(device, &gfx_pipeline->base); |
| |
| pvr_pipeline_destroy_shader_data(&gfx_pipeline->vs_data); |
| pvr_pipeline_destroy_shader_data(&gfx_pipeline->fs_data); |
| |
| vk_free2(&device->vk.alloc, allocator, gfx_pipeline); |
| } |
| |
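| /* Saves the compiled vertex shader's pco_data into the pipeline and |
| * initializes the vertex stage state. |
| */ |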
| static void pvr_vertex_state_save(struct pvr_graphics_pipeline *gfx_pipeline, |
| pco_shader *vs) |
| { |
| struct pvr_vertex_shader_state *vertex_state = |
| &gfx_pipeline->shader_state.vertex; |
| |
| const pco_data *shader_data = pco_shader_data(vs); |
| memcpy(&gfx_pipeline->vs_data, shader_data, sizeof(*shader_data)); |
| |
| /* This ends up unused since we'll use the temp_usage for the PDS program we |
| * end up selecting, and the descriptor PDS program doesn't use any temps. |
| * Let's set it to ~0 in case it ever gets used. |
| */ |
| vertex_state->stage_state.pds_temps_count = ~0; |
| } |
| |
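| /* Saves the compiled fragment shader's pco_data into the pipeline and sets |
| * the initial fragment stage state (pass type and sample rate). |
| */ |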
| static void pvr_fragment_state_save(struct pvr_graphics_pipeline *gfx_pipeline, |
| pco_shader *fs) |
| { |
| struct pvr_fragment_shader_state *fragment_state = |
| &gfx_pipeline->shader_state.fragment; |
| |
| const pco_data *shader_data = pco_shader_data(fs); |
| memcpy(&gfx_pipeline->fs_data, shader_data, sizeof(*shader_data)); |
| |
| /* TODO: add selection for other values of pass type and sample rate. */ |
| fragment_state->pass_type = ROGUE_TA_PASSTYPE_OPAQUE; |
| fragment_state->sample_rate = ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE; |
| |
| /* We can't initialize it yet since we still need to generate the PDS |
| * programs, so set it to ~0 to make sure it gets set up later on. |
| */ |
| fragment_state->stage_state.pds_temps_count = ~0; |
| } |
| |
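| /* Returns true if the blend factor reads the blend constant color/alpha. */ |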
| static bool pvr_blend_factor_requires_consts(VkBlendFactor factor) |
| { |
| switch (factor) { |
| case VK_BLEND_FACTOR_CONSTANT_COLOR: |
| case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: |
| case VK_BLEND_FACTOR_CONSTANT_ALPHA: |
| case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA: |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| /** |
| * \brief Indicates whether dynamic blend constants are needed. |
| * |
| * If the user has specified the blend constants to be dynamic, they might |
| * not actually be using them. This function checks whether any enabled blend |
| * equation reads them, so we know whether they need to be uploaded later on |
| * for the shader to access. |
| */ |
| static bool pvr_graphics_pipeline_requires_dynamic_blend_consts( |
| const struct pvr_graphics_pipeline *gfx_pipeline) |
| { |
| const struct vk_dynamic_graphics_state *const state = |
| &gfx_pipeline->dynamic_state; |
| |
| if (BITSET_TEST(state->set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) |
| return false; |
| |
| for (uint32_t i = 0; i < state->cb.attachment_count; i++) { |
| const struct vk_color_blend_attachment_state *attachment = |
| &state->cb.attachments[i]; |
| |
| const bool has_color_write = |
| attachment->write_mask & |
| (VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | |
| VK_COLOR_COMPONENT_B_BIT); |
| const bool has_alpha_write = attachment->write_mask & |
| VK_COLOR_COMPONENT_A_BIT; |
| |
| if (!attachment->blend_enable || attachment->write_mask == 0) |
| continue; |
| |
| if (has_color_write) { |
| const uint8_t src_color_blend_factor = |
| attachment->src_color_blend_factor; |
| const uint8_t dst_color_blend_factor = |
| attachment->dst_color_blend_factor; |
| |
| if (pvr_blend_factor_requires_consts(src_color_blend_factor) || |
| pvr_blend_factor_requires_consts(dst_color_blend_factor)) { |
| return true; |
| } |
| } |
| |
| if (has_alpha_write) { |
| const uint8_t src_alpha_blend_factor = |
| attachment->src_alpha_blend_factor; |
| const uint8_t dst_alpha_blend_factor = |
| attachment->dst_alpha_blend_factor; |
| |
| if (pvr_blend_factor_requires_consts(src_alpha_blend_factor) || |
| pvr_blend_factor_requires_consts(dst_alpha_blend_factor)) { |
| return true; |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
| #undef PVR_DEV_ADDR_SIZE_IN_SH_REGS |
| |
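| /* Fills out one pvr_pds_vertex_dma description per vertex attribute used by |
| * the vertex shader; these describe the DMAs performed by the vertex |
| * attribute PDS programs. |
| */ |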
| static void pvr_graphics_pipeline_setup_vertex_dma( |
| struct pvr_graphics_pipeline *gfx_pipeline, |
| const VkPipelineVertexInputStateCreateInfo *const vertex_input_state, |
| struct pvr_pds_vertex_dma *const dma_descriptions, |
| uint32_t *const dma_count) |
| { |
| pco_vs_data *vs_data = &gfx_pipeline->vs_data.vs; |
| |
| const VkVertexInputBindingDescription |
| *sorted_bindings[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 }; |
| const VkVertexInputAttributeDescription |
| *sorted_attributes[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 }; |
| |
| /* Vertex attributes map to the `layout(location = x)` annotation in the |
| * shader where `x` is the attribute's location. |
| * Vertex bindings have NO relation to the shader. They have nothing to do |
| * with the `layout(set = x, binding = y)` notation. They instead indicate |
| * where the data for a collection of vertex attributes comes from. The |
| * application binds a VkBuffer with vkCmdBindVertexBuffers() to a specific |
| * binding number and based on that we'll know which buffer to DMA the data |
| * from, to fill in the collection of vertex attributes. |
| */ |
| |
| for (uint32_t i = 0; i < vertex_input_state->vertexBindingDescriptionCount; |
| i++) { |
| const VkVertexInputBindingDescription *binding_desc = |
| &vertex_input_state->pVertexBindingDescriptions[i]; |
| |
| sorted_bindings[binding_desc->binding] = binding_desc; |
| } |
| |
| for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount; |
| i++) { |
| const VkVertexInputAttributeDescription *attribute_desc = |
| &vertex_input_state->pVertexAttributeDescriptions[i]; |
| |
| sorted_attributes[attribute_desc->location] = attribute_desc; |
| } |
| |
| for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount; |
| i++) { |
| const VkVertexInputAttributeDescription *attribute = sorted_attributes[i]; |
| if (!attribute) |
| continue; |
| |
| gl_vert_attrib location = attribute->location + VERT_ATTRIB_GENERIC0; |
| const VkVertexInputBindingDescription *binding = |
| sorted_bindings[attribute->binding]; |
| struct pvr_pds_vertex_dma *dma_desc = &dma_descriptions[*dma_count]; |
| const struct util_format_description *fmt_description = |
| vk_format_description(attribute->format); |
| |
| const pco_range *attrib_range = &vs_data->attribs[location]; |
| |
| /* Skip unused attributes. */ |
| if (!attrib_range->count) |
| continue; |
| |
| /* DMA setup. */ |
| |
| /* The PDS program sets up DDMADs to DMA attributes into vtxin regs. |
| * |
| * DDMAD -> Multiply, add, and DOUTD (i.e. DMA from that address). |
| * DMA source addr = src0 * src1 + src2 |
| * DMA params = src3 |
| * |
| * In the PDS program we set up src0 with the binding's stride and src1 with |
| * either the instance id or vertex id (both of which get filled by the |
| * hardware). We set up src2 later on, once we know which VkBuffer to DMA the |
| * data from, so it's saved for later when we patch the data section. |
| */ |
| |
| /* TODO: Right now we're setting up a DMA per attribute. In a case where |
| * there are multiple attributes packed into a single binding with |
| * adjacent locations we'd still be DMAing them separately. This is not |
| * great so the DMA setup should be smarter and could do with some |
| * optimization. |
| */ |
| |
| *dma_desc = (struct pvr_pds_vertex_dma){ 0 }; |
| |
| /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation |
| * this corresponds to `attribDesc.offset`. |
| * The PDS program doesn't do anything with it other than save it in the |
| * PDS program entry. |
| */ |
| dma_desc->offset = attribute->offset; |
| |
| /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation |
| * this corresponds to `bindingDesc.stride`. |
| * The PDS program will calculate the `effectiveVertexOffset` with this |
| * and add it to the address provided in the patched data segment. |
| */ |
| dma_desc->stride = binding->stride; |
| |
| if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) |
| dma_desc->flags = PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE; |
| else |
| dma_desc->flags = 0; |
| |
| /* Size to DMA per vertex attribute. Used to setup src3 in the DDMAD. */ |
| dma_desc->size_in_dwords = attrib_range->count; |
| |
| /* Vtxin reg offset to start DMAing into. */ |
| dma_desc->destination = attrib_range->start; |
| |
| /* Will be used by the driver to figure out the buffer address to patch into |
| * the data section, i.e. which binding we should DMA from. |
| */ |
| dma_desc->binding_index = attribute->binding; |
| |
| /* We don't currently support VK_EXT_vertex_attribute_divisor, so no |
| * repeating of instance-rate vertex attributes is needed; we always move on |
| * to the next vertex attribute. |
| */ |
| assert(binding->inputRate != VK_VERTEX_INPUT_RATE_INSTANCE); |
| dma_desc->divisor = 1; |
| |
| /* Will be used to generate PDS code that takes care of robust buffer |
| * access, and later on by the driver to write the correct robustness |
| * buffer address to DMA the fallback values from. |
| */ |
| dma_desc->robustness_buffer_offset = |
| pvr_get_robustness_buffer_format_offset(attribute->format); |
| |
| /* Used later on by the driver to figure out if the buffer is being |
| * accessed out of bounds, for robust buffer access. |
| */ |
| dma_desc->component_size_in_bytes = |
| fmt_description->block.bits / fmt_description->nr_channels / 8; |
| |
| ++*dma_count; |
| } |
| } |
| |
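| /* Builds the DOUTI iterator list for the PDS coefficient loading program: |
| * z/w iterators first, then point coords, then one iterator per input |
| * varying, matching the coefficient layout allocated in |
| * pvr_alloc_fs_varyings(). |
| */ |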
| static void pvr_graphics_pipeline_setup_fragment_coeff_program( |
| struct pvr_graphics_pipeline *gfx_pipeline, |
| nir_shader *fs, |
| struct pvr_pds_coeff_loading_program *frag_coeff_program) |
| { |
| uint64_t varyings_used = fs->info.inputs_read & |
| BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING); |
| pco_vs_data *vs_data = &gfx_pipeline->vs_data.vs; |
| pco_fs_data *fs_data = &gfx_pipeline->fs_data.fs; |
| |
| unsigned fpu = 0; |
| unsigned dest = 0; |
| |
| if (fs_data->uses.z) { |
| pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu], |
| PDSINST_DOUT_FIELDS_DOUTI_SRC, |
| douti_src) { |
| /* TODO: define instead of sizeof(uint16_t). */ |
| douti_src.f32_offset = fs_data->uses.w ? 1 * sizeof(uint16_t) : 0; |
| douti_src.f16_offset = douti_src.f32_offset; |
| douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; |
| douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D; |
| } |
| |
| frag_coeff_program->destination[fpu++] = dest++; |
| } |
| |
| if (fs_data->uses.w) { |
| pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu], |
| PDSINST_DOUT_FIELDS_DOUTI_SRC, |
| douti_src) { |
| douti_src.f32_offset = 0; |
| douti_src.f16_offset = douti_src.f32_offset; |
| douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; |
| douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D; |
| } |
| |
| frag_coeff_program->destination[fpu++] = dest++; |
| } |
| |
| if (fs_data->uses.pntc) { |
| pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu], |
| PDSINST_DOUT_FIELDS_DOUTI_SRC, |
| douti_src) { |
| douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; |
| douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_2D; |
| douti_src.pointsprite = true; |
| } |
| |
| frag_coeff_program->destination[fpu++] = dest; |
| dest += 2; |
| } |
| |
| u_foreach_bit64 (varying, varyings_used) { |
| nir_variable *var = |
| nir_find_variable_with_location(fs, nir_var_shader_in, varying); |
| assert(var); |
| |
| pco_range *cf_range = &fs_data->varyings[varying]; |
| assert(cf_range->count > 0); |
| assert(!(cf_range->start % ROGUE_USC_COEFFICIENT_SET_SIZE)); |
| assert(!(cf_range->count % ROGUE_USC_COEFFICIENT_SET_SIZE)); |
| |
| pco_range *vtxout_range = &vs_data->varyings[varying]; |
| assert(vtxout_range->count > 0); |
| assert(vtxout_range->start >= 4); |
| |
| assert(vtxout_range->count == |
| cf_range->count / ROGUE_USC_COEFFICIENT_SET_SIZE); |
| |
| unsigned count = vtxout_range->count; |
| |
| unsigned vtxout = vtxout_range->start; |
| |
| /* pos.x, pos.y unused. */ |
| vtxout -= 2; |
| |
| /* pos.z unused. */ |
| if (!fs_data->uses.z) |
| vtxout -= 1; |
| |
| /* pos.w unused. */ |
| if (!fs_data->uses.w) |
| vtxout -= 1; |
| |
| pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu], |
| PDSINST_DOUT_FIELDS_DOUTI_SRC, |
| douti_src) { |
| /* TODO: define instead of sizeof(uint16_t). */ |
| douti_src.f32_offset = vtxout * sizeof(uint16_t); |
| /* TODO: f16 support. */ |
| douti_src.f16 = false; |
| douti_src.f16_offset = douti_src.f32_offset; |
| |
| switch (var->data.interpolation) { |
| case INTERP_MODE_SMOOTH: |
| douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; |
| douti_src.perspective = true; |
| break; |
| |
| case INTERP_MODE_NOPERSPECTIVE: |
| douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; |
| break; |
| |
| case INTERP_MODE_FLAT: |
| /* TODO: triangle fan, provoking vertex last. */ |
| douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_FLAT_VERTEX0; |
| break; |
| |
| default: |
| unreachable("Unimplemented interpolation type."); |
| } |
| |
| douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D + count - 1; |
| } |
| |
| frag_coeff_program->destination[fpu++] = |
| cf_range->start / ROGUE_USC_COEFFICIENT_SET_SIZE; |
| } |
| |
| frag_coeff_program->num_fpu_iterators = fpu; |
| } |
| |
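| /* Helpers for filling in pco_range allocation lists: set_var() records a |
| * variable at a fixed offset, allocate_var()/allocate_val() place a variable |
| * or value at the current counter and advance it, and the try_* variants |
| * only allocate locations present in the given bitset. |
| */ |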
| static void set_var(pco_range *allocation_list, |
| unsigned to, |
| nir_variable *var, |
| unsigned dwords_each) |
| { |
| unsigned slots = glsl_count_dword_slots(var->type, false); |
| |
| allocation_list[var->data.location] = (pco_range){ |
| .start = to, |
| .count = slots * dwords_each, |
| }; |
| } |
| |
| static void allocate_var(pco_range *allocation_list, |
| unsigned *counter, |
| nir_variable *var, |
| unsigned dwords_each) |
| { |
| unsigned slots = glsl_count_dword_slots(var->type, false); |
| |
| allocation_list[var->data.location] = (pco_range){ |
| .start = *counter, |
| .count = slots * dwords_each, |
| }; |
| |
| *counter += slots * dwords_each; |
| } |
| |
| static void try_allocate_var(pco_range *allocation_list, |
| unsigned *counter, |
| nir_shader *nir, |
| uint64_t bitset, |
| nir_variable_mode mode, |
| int location, |
| unsigned dwords_each) |
| { |
| nir_variable *var = nir_find_variable_with_location(nir, mode, location); |
| |
| if (!(bitset & BITFIELD64_BIT(location))) |
| return; |
| |
| assert(var); |
| |
| allocate_var(allocation_list, counter, var, dwords_each); |
| } |
| |
| static void try_allocate_vars(pco_range *allocation_list, |
| unsigned *counter, |
| nir_shader *nir, |
| uint64_t *bitset, |
| nir_variable_mode mode, |
| bool f16, |
| enum glsl_interp_mode interp_mode, |
| unsigned dwords_each) |
| { |
| uint64_t skipped = 0; |
| |
| while (*bitset) { |
| int location = u_bit_scan64(bitset); |
| |
| nir_variable *var = nir_find_variable_with_location(nir, mode, location); |
| assert(var); |
| |
| if (glsl_type_is_16bit(glsl_without_array_or_matrix(var->type)) != f16 || |
| var->data.interpolation != interp_mode) { |
| skipped |= BITFIELD64_BIT(location); |
| continue; |
| } |
| |
| allocate_var(allocation_list, counter, var, dwords_each); |
| } |
| |
| *bitset |= skipped; |
| } |
| |
| static void allocate_val(pco_range *allocation_list, |
| unsigned *counter, |
| unsigned location, |
| unsigned dwords_each) |
| { |
| allocation_list[location] = (pco_range){ |
| .start = *counter, |
| .count = dwords_each, |
| }; |
| |
| *counter += dwords_each; |
| } |
| |
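| /* Allocates vtxin registers for the vertex system values (vertex/instance |
| * id, base vertex/instance, draw id) read by the shader. |
| */ |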
| static void pvr_alloc_vs_sysvals(pco_data *data, nir_shader *nir) |
| { |
| BITSET_DECLARE(system_values_read, SYSTEM_VALUE_MAX); |
| BITSET_COPY(system_values_read, nir->info.system_values_read); |
| |
| gl_system_value sys_vals[] = { |
| SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_INSTANCE_ID, |
| SYSTEM_VALUE_BASE_INSTANCE, SYSTEM_VALUE_BASE_VERTEX, |
| SYSTEM_VALUE_DRAW_ID, |
| }; |
| |
| for (unsigned u = 0; u < ARRAY_SIZE(sys_vals); ++u) { |
| if (BITSET_TEST(system_values_read, sys_vals[u])) { |
| nir_intrinsic_op op = nir_intrinsic_from_system_value(sys_vals[u]); |
| unsigned dwords = nir_intrinsic_infos[op].dest_components; |
| assert(dwords > 0); |
| |
| allocate_val(data->common.sys_vals, |
| &data->common.vtxins, |
| sys_vals[u], |
| dwords); |
| |
| BITSET_CLEAR(system_values_read, sys_vals[u]); |
| } |
| } |
| |
| assert(BITSET_IS_EMPTY(system_values_read)); |
| } |
| |
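| /* Records the pipe_format of each vertex attribute described in the vertex |
| * input state. |
| */ |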
| static void pvr_init_vs_attribs( |
| pco_data *data, |
| const VkPipelineVertexInputStateCreateInfo *const vertex_input_state) |
| { |
| for (unsigned u = 0; u < vertex_input_state->vertexAttributeDescriptionCount; |
| ++u) { |
| const VkVertexInputAttributeDescription *attrib = |
| &vertex_input_state->pVertexAttributeDescriptions[u]; |
| |
| gl_vert_attrib location = attrib->location + VERT_ATTRIB_GENERIC0; |
| |
| data->vs.attrib_formats[location] = |
| vk_format_to_pipe_format(attrib->format); |
| } |
| } |
| |
| static void pvr_alloc_vs_attribs(pco_data *data, nir_shader *nir) |
| { |
| /* TODO NEXT: this should be based on the format size. */ |
| nir_foreach_shader_in_variable (var, nir) { |
| allocate_var(data->vs.attribs, &data->common.vtxins, var, 1); |
| } |
| } |
| |
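| /* Allocates vtxout registers for the vertex outputs: position first, then |
| * the generic varyings grouped by precision and interpolation mode, with |
| * point size/viewport/layer last. |
| */ |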
| static void pvr_alloc_vs_varyings(pco_data *data, nir_shader *nir) |
| { |
| uint64_t vars_mask = nir->info.outputs_written & |
| BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING); |
| |
| /* Output position must be present. */ |
| assert(nir_find_variable_with_location(nir, |
| nir_var_shader_out, |
| VARYING_SLOT_POS)); |
| |
| /* Varyings must be allocated in a specific order; position comes first. */ |
| try_allocate_var(data->vs.varyings, |
| &data->vs.vtxouts, |
| nir, |
| nir->info.outputs_written, |
| nir_var_shader_out, |
| VARYING_SLOT_POS, |
| 1); |
| |
| /* Save varying counts. */ |
| u_foreach_bit64 (location, vars_mask) { |
| nir_variable *var = |
| nir_find_variable_with_location(nir, nir_var_shader_out, location); |
| assert(var); |
| |
| /* TODO: f16 support. */ |
| bool f16 = glsl_type_is_16bit(glsl_without_array_or_matrix(var->type)); |
| assert(!f16); |
| unsigned components = glsl_get_components(var->type); |
| |
| switch (var->data.interpolation) { |
| case INTERP_MODE_SMOOTH: |
| if (f16) |
| data->vs.f16_smooth += components; |
| else |
| data->vs.f32_smooth += components; |
| |
| break; |
| |
| case INTERP_MODE_FLAT: |
| if (f16) |
| data->vs.f16_flat += components; |
| else |
| data->vs.f32_flat += components; |
| |
| break; |
| |
| case INTERP_MODE_NOPERSPECTIVE: |
| if (f16) |
| data->vs.f16_npc += components; |
| else |
| data->vs.f32_npc += components; |
| |
| break; |
| |
| default: |
| unreachable("Unsupported interpolation mode."); |
| } |
| } |
| |
| for (unsigned f16 = 0; f16 <= 1; ++f16) { |
| for (enum glsl_interp_mode interp_mode = INTERP_MODE_SMOOTH; |
| interp_mode <= INTERP_MODE_NOPERSPECTIVE; |
| ++interp_mode) { |
| try_allocate_vars(data->vs.varyings, |
| &data->vs.vtxouts, |
| nir, |
| &vars_mask, |
| nir_var_shader_out, |
| f16, |
| interp_mode, |
| 1); |
| } |
| } |
| |
| assert(!vars_mask); |
| |
| const gl_varying_slot last_slots[] = { |
| VARYING_SLOT_PSIZ, |
| VARYING_SLOT_VIEWPORT, |
| VARYING_SLOT_LAYER, |
| }; |
| |
| for (unsigned u = 0; u < ARRAY_SIZE(last_slots); ++u) { |
| try_allocate_var(data->vs.varyings, |
| &data->vs.vtxouts, |
| nir, |
| nir->info.outputs_written, |
| nir_var_shader_out, |
| last_slots[u], |
| 1); |
| } |
| } |
| |
| static void pvr_alloc_fs_sysvals(pco_data *data, nir_shader *nir) |
| { |
| /* TODO */ |
| } |
| |
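| /* Allocates coefficient register sets for the fragment shader inputs: |
|  * z/w (if used) first, then point coords, then the remaining varyings. |
|  */ |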
| static void pvr_alloc_fs_varyings(pco_data *data, nir_shader *nir) |
| { |
| assert(!data->common.coeffs); |
| |
| /* Save the z/w locations. */ |
| unsigned zw_count = !!data->fs.uses.z + !!data->fs.uses.w; |
| allocate_val(data->fs.varyings, |
| &data->common.coeffs, |
| VARYING_SLOT_POS, |
| zw_count * ROGUE_USC_COEFFICIENT_SET_SIZE); |
| |
| /* If point coords are used, they come after z/w (if present). */ |
| nir_variable *var = nir_find_variable_with_location(nir, |
| nir_var_shader_in, |
| VARYING_SLOT_PNTC); |
| if (var) { |
| assert(!var->data.location_frac); |
|       ASSERTED unsigned count = glsl_get_components(var->type); |
| assert(count == 2); |
| |
| allocate_var(data->fs.varyings, |
| &data->common.coeffs, |
| var, |
| ROGUE_USC_COEFFICIENT_SET_SIZE); |
| |
| data->fs.uses.pntc = true; |
| } |
| |
| /* Allocate the rest of the input varyings. */ |
| nir_foreach_shader_in_variable (var, nir) { |
| /* Already handled. */ |
| if (var->data.location == VARYING_SLOT_POS || |
| var->data.location == VARYING_SLOT_PNTC) |
| continue; |
| |
| allocate_var(data->fs.varyings, |
| &data->common.coeffs, |
| var, |
| ROGUE_USC_COEFFICIENT_SET_SIZE); |
| } |
| } |
| |
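| /* Records the pipe format of each color attachment written by the |
|  * subpass. |
|  */ |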
| static void |
| pvr_init_fs_outputs(pco_data *data, |
| const struct pvr_render_pass *pass, |
| const struct pvr_render_subpass *const subpass, |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass) |
| { |
| for (unsigned u = 0; u < subpass->color_count; ++u) { |
| unsigned idx = subpass->color_attachments[u]; |
| if (idx == VK_ATTACHMENT_UNUSED) |
| continue; |
| |
| gl_frag_result location = FRAG_RESULT_DATA0 + u; |
| VkFormat vk_format = pass->attachments[idx].vk_format; |
| data->fs.output_formats[location] = vk_format_to_pipe_format(vk_format); |
| } |
| |
| /* TODO: z-replicate. */ |
| } |
| |
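| /* Maps each fragment shader color output onto its MRT resource; only |
|  * output registers are handled for now (tile buffers are TODO). |
|  */ |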
| static void |
| pvr_setup_fs_outputs(pco_data *data, |
| nir_shader *nir, |
| const struct pvr_render_subpass *const subpass, |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass) |
| { |
| ASSERTED unsigned num_outputs = hw_subpass->setup.num_render_targets; |
| assert(num_outputs == subpass->color_count); |
| |
| uint64_t outputs_written = nir->info.outputs_written; |
| assert(util_bitcount64(outputs_written) == num_outputs); |
| |
| for (unsigned u = 0; u < subpass->color_count; ++u) { |
| gl_frag_result location = FRAG_RESULT_DATA0 + u; |
| unsigned idx = subpass->color_attachments[u]; |
| const struct usc_mrt_resource *mrt_resource; |
|       bool output_reg; |
| enum pipe_format format; |
| unsigned format_bits; |
| nir_variable *var; |
| |
| if (idx == VK_ATTACHMENT_UNUSED) |
| continue; |
| |
|       /* TODO: confirm the color attachment index always matches the MRT |
|        * index. |
|        */ |
|       assert(u == idx); |
| |
| mrt_resource = &hw_subpass->setup.mrt_resources[u]; |
| output_reg = mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG; |
| |
| assert(output_reg); |
| /* TODO: tile buffer support. */ |
| |
| var = nir_find_variable_with_location(nir, nir_var_shader_out, location); |
| assert(var); |
| |
| format = data->fs.output_formats[location]; |
| format_bits = util_format_get_blocksizebits(format); |
| /* TODO: other sized formats. */ |
| assert(!(format_bits % 32)); |
| |
| assert(mrt_resource->intermediate_size == format_bits / 8); |
| |
| set_var(data->fs.outputs, |
| mrt_resource->reg.output_reg, |
| var, |
| format_bits / 32); |
| data->fs.output_reg[location] = output_reg; |
| |
| outputs_written &= ~BITFIELD64_BIT(location); |
| } |
| |
| /* TODO: z-replicate. */ |
| |
| assert(!outputs_written); |
| } |
| |
| static void pvr_init_fs_input_attachments( |
| pco_data *data, |
| const struct pvr_render_subpass *const subpass, |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass) |
| { |
| pvr_finishme("pvr_init_fs_input_attachments"); |
| } |
| |
| static void pvr_setup_fs_input_attachments( |
| pco_data *data, |
| nir_shader *nir, |
| const struct pvr_render_subpass *const subpass, |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass) |
| { |
| pvr_finishme("pvr_setup_fs_input_attachments"); |
| } |
| |
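| /* Allocates per-binding data for each descriptor set visible to this |
|  * shader stage. |
|  */ |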
| static void pvr_init_descriptors(pco_data *data, |
| nir_shader *nir, |
| struct vk_pipeline_layout *layout) |
| { |
| for (unsigned desc_set = 0; desc_set < layout->set_count; ++desc_set) { |
| const struct pvr_descriptor_set_layout *set_layout = |
| vk_to_pvr_descriptor_set_layout(layout->set_layouts[desc_set]); |
| pco_descriptor_set_data *desc_set_data = |
| &data->common.desc_sets[desc_set]; |
| |
| /* If the descriptor set isn't for this stage, skip it. */ |
| if (!(BITFIELD_BIT(nir->info.stage) & set_layout->stage_flags)) |
| continue; |
| |
| desc_set_data->binding_count = set_layout->binding_count; |
| desc_set_data->bindings = |
| rzalloc_array_size(NULL, |
| sizeof(*desc_set_data->bindings), |
| set_layout->binding_count); |
| } |
| } |
| |
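| /* Assigns shared register ranges to each descriptor set (and its bindings) |
|  * used by this shader stage. |
|  */ |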
| static void pvr_setup_descriptors(pco_data *data, |
| nir_shader *nir, |
| struct vk_pipeline_layout *layout) |
| { |
| gl_shader_stage stage = nir->info.stage; |
| |
| for (unsigned desc_set = 0; desc_set < layout->set_count; ++desc_set) { |
| const struct pvr_descriptor_set_layout *set_layout = |
| vk_to_pvr_descriptor_set_layout(layout->set_layouts[desc_set]); |
| const unsigned desc_set_size_dw = set_layout->size / sizeof(uint32_t); |
| pco_descriptor_set_data *desc_set_data = |
| &data->common.desc_sets[desc_set]; |
| pco_range *desc_set_range = &desc_set_data->range; |
| |
| assert(!(set_layout->size % sizeof(uint32_t))); |
| |
| /* If the descriptor set isn't for this stage or is unused, skip it. */ |
| if (!(BITFIELD_BIT(stage) & set_layout->stage_flags)) { |
| assert(!desc_set_data->used); |
| continue; |
| } |
| |
| if (!desc_set_data->used) |
| continue; |
| |
| desc_set_range->start = data->common.shareds; |
| desc_set_range->count = desc_set_size_dw; |
| data->common.shareds += desc_set_size_dw; |
| |
| for (unsigned binding = 0; binding < set_layout->binding_count; |
| ++binding) { |
| const struct pvr_descriptor_set_layout_binding *layout_binding = |
| &set_layout->bindings[binding]; |
| pco_binding_data *binding_data = &desc_set_data->bindings[binding]; |
| |
| binding_data->range = (pco_range){ |
| .start = desc_set_range->start + |
| (layout_binding->offset / sizeof(uint32_t)), |
| .count = |
| (layout_binding->stride * layout_binding->descriptor_count) / |
| sizeof(uint32_t), |
| .stride = layout_binding->stride / sizeof(uint32_t), |
| }; |
| } |
| } |
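|    /* Shared register usage must fit within the hardware budget (assumed |
|     * here to be 256 dwords). |
|     */ |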
| assert(data->common.shareds < 256); |
| } |
| |
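| /* Per-stage shader data setup that must happen before the NIR is lowered, |
|  * e.g. recording attribute/attachment formats from the pipeline create |
|  * info. |
|  */ |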
| static void |
| pvr_preprocess_shader_data(pco_data *data, |
| nir_shader *nir, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| struct vk_pipeline_layout *layout) |
| { |
| switch (nir->info.stage) { |
| case MESA_SHADER_VERTEX: { |
| const VkPipelineVertexInputStateCreateInfo *const vertex_input_state = |
| pCreateInfo->pVertexInputState; |
| |
| pvr_init_vs_attribs(data, vertex_input_state); |
| break; |
| } |
| |
| case MESA_SHADER_FRAGMENT: { |
| PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass); |
| const struct pvr_render_subpass *const subpass = |
| &pass->subpasses[pCreateInfo->subpass]; |
| const struct pvr_renderpass_hw_map *subpass_map = |
| &pass->hw_setup->subpass_map[pCreateInfo->subpass]; |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass = |
| &pass->hw_setup->renders[subpass_map->render] |
| .subpasses[subpass_map->subpass]; |
| |
| pvr_init_fs_outputs(data, pass, subpass, hw_subpass); |
| pvr_init_fs_input_attachments(data, subpass, hw_subpass); |
| |
| /* TODO: push consts, blend consts, dynamic state, etc. */ |
| break; |
| } |
| |
| default: |
|       unreachable("Unsupported shader stage."); |
| } |
| |
| pvr_init_descriptors(data, nir, layout); |
| |
| /* TODO: common things, like large constants being put into shareds. */ |
| } |
| |
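| /* Per-stage register allocation that must happen after the NIR has been |
|  * lowered, once the final set of inputs, outputs and system values is |
|  * known. |
|  */ |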
| static void |
| pvr_postprocess_shader_data(pco_data *data, |
| nir_shader *nir, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| struct vk_pipeline_layout *layout) |
| { |
| switch (nir->info.stage) { |
| case MESA_SHADER_VERTEX: { |
| pvr_alloc_vs_sysvals(data, nir); |
| pvr_alloc_vs_attribs(data, nir); |
| pvr_alloc_vs_varyings(data, nir); |
| break; |
| } |
| |
| case MESA_SHADER_FRAGMENT: { |
| PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass); |
| const struct pvr_render_subpass *const subpass = |
| &pass->subpasses[pCreateInfo->subpass]; |
| const struct pvr_renderpass_hw_map *subpass_map = |
| &pass->hw_setup->subpass_map[pCreateInfo->subpass]; |
| const struct pvr_renderpass_hwsetup_subpass *hw_subpass = |
| &pass->hw_setup->renders[subpass_map->render] |
| .subpasses[subpass_map->subpass]; |
| |
| pvr_alloc_fs_sysvals(data, nir); |
| pvr_alloc_fs_varyings(data, nir); |
| pvr_setup_fs_outputs(data, nir, subpass, hw_subpass); |
| pvr_setup_fs_input_attachments(data, nir, subpass, hw_subpass); |
| |
| /* TODO: push consts, blend consts, dynamic state, etc. */ |
| break; |
| } |
| |
| default: |
|       unreachable("Unsupported shader stage."); |
| } |
| |
| pvr_setup_descriptors(data, nir, layout); |
| |
| /* TODO: common things, like large constants being put into shareds. */ |
| } |
| |
| /* Compiles and uploads shaders and PDS programs. */ |
| static VkResult |
| pvr_graphics_pipeline_compile(struct pvr_device *const device, |
| struct vk_pipeline_cache *cache, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *const allocator, |
| struct pvr_graphics_pipeline *const gfx_pipeline) |
| { |
| struct vk_pipeline_layout *layout = gfx_pipeline->base.layout; |
| const uint32_t cache_line_size = |
| rogue_get_slc_cache_line_size(&device->pdevice->dev_info); |
| VkResult result; |
| |
| struct pvr_vertex_shader_state *vertex_state = |
| &gfx_pipeline->shader_state.vertex; |
| struct pvr_fragment_shader_state *fragment_state = |
| &gfx_pipeline->shader_state.fragment; |
| |
| pco_ctx *pco_ctx = device->pdevice->pco_ctx; |
| |
| nir_shader *producer = NULL; |
| nir_shader *consumer = NULL; |
| pco_data shader_data[MESA_SHADER_STAGES] = { 0 }; |
| nir_shader *nir_shaders[MESA_SHADER_STAGES] = { 0 }; |
| pco_shader *pco_shaders[MESA_SHADER_STAGES] = { 0 }; |
| pco_shader **vs = &pco_shaders[MESA_SHADER_VERTEX]; |
| pco_shader **fs = &pco_shaders[MESA_SHADER_FRAGMENT]; |
| void *shader_mem_ctx = ralloc_context(NULL); |
| |
| struct pvr_pds_vertex_dma vtx_dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS]; |
| uint32_t vtx_dma_count = 0; |
| |
| struct pvr_pds_coeff_loading_program frag_coeff_program = { 0 }; |
| |
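|    /* Convert each active stage to NIR and preprocess it. */ |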
| for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) { |
| size_t stage_index = gfx_pipeline->stage_indices[stage]; |
| |
| /* Skip unused/inactive stages. */ |
| if (stage_index == ~0) |
| continue; |
| |
| result = |
| vk_pipeline_shader_stage_to_nir(&device->vk, |
| gfx_pipeline->base.pipeline_flags, |
| &pCreateInfo->pStages[stage_index], |
| pco_spirv_options(), |
| pco_nir_options(), |
| shader_mem_ctx, |
| &nir_shaders[stage]); |
| if (result != VK_SUCCESS) |
| goto err_free_build_context; |
| |
| pco_preprocess_nir(pco_ctx, nir_shaders[stage]); |
| } |
| |
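|    /* Link each stage with its producer. */ |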
| for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) { |
| if (!nir_shaders[stage]) |
| continue; |
| |
| if (producer) |
| pco_link_nir(pco_ctx, producer, nir_shaders[stage]); |
| |
| producer = nir_shaders[stage]; |
| } |
| |
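|    /* Link each stage with its consumer, iterating in reverse. */ |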
| for (gl_shader_stage stage = MESA_SHADER_STAGES; stage-- > 0;) { |
| if (!nir_shaders[stage]) |
| continue; |
| |
| if (consumer) |
| pco_rev_link_nir(pco_ctx, nir_shaders[stage], consumer); |
| |
| consumer = nir_shaders[stage]; |
| } |
| |
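|    /* Set up the shader data, then lower and post-process the NIR. */ |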
| for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) { |
| if (!nir_shaders[stage]) |
| continue; |
| |
| pvr_preprocess_shader_data(&shader_data[stage], |
| nir_shaders[stage], |
| pCreateInfo, |
| layout); |
| |
| pco_lower_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]); |
| |
| pco_postprocess_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]); |
| |
| pvr_postprocess_shader_data(&shader_data[stage], |
| nir_shaders[stage], |
| pCreateInfo, |
| layout); |
| } |
| |
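|    /* Translate each stage to PCO IR, then process and encode it. */ |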
| for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) { |
| pco_shader **pco = &pco_shaders[stage]; |
| |
| /* Skip unused/inactive stages. */ |
| if (!nir_shaders[stage]) |
| continue; |
| |
| *pco = pco_trans_nir(pco_ctx, |
| nir_shaders[stage], |
| &shader_data[stage], |
| shader_mem_ctx); |
| if (!*pco) { |
| result = VK_ERROR_INITIALIZATION_FAILED; |
| goto err_free_build_context; |
| } |
| |
| pco_process_ir(pco_ctx, *pco); |
| pco_encode_ir(pco_ctx, *pco); |
| } |
| |
| pvr_vertex_state_save(gfx_pipeline, *vs); |
| |
| pvr_graphics_pipeline_setup_vertex_dma(gfx_pipeline, |
| pCreateInfo->pVertexInputState, |
| vtx_dma_descriptions, |
| &vtx_dma_count); |
| |
| result = pvr_gpu_upload_usc(device, |
| pco_shader_binary_data(*vs), |
| pco_shader_binary_size(*vs), |
| cache_line_size, |
| &vertex_state->bo); |
| if (result != VK_SUCCESS) |
| goto err_free_build_context; |
| |
| if (*fs) { |
| pvr_fragment_state_save(gfx_pipeline, *fs); |
| |
| pvr_graphics_pipeline_setup_fragment_coeff_program( |
| gfx_pipeline, |
| nir_shaders[MESA_SHADER_FRAGMENT], |
| &frag_coeff_program); |
| |
| result = pvr_gpu_upload_usc(device, |
| pco_shader_binary_data(*fs), |
| pco_shader_binary_size(*fs), |
| cache_line_size, |
| &fragment_state->bo); |
| if (result != VK_SUCCESS) |
| goto err_free_vertex_bo; |
| |
| result = pvr_pds_coeff_program_create_and_upload(device, |
| allocator, |
| &frag_coeff_program, |
| fragment_state); |
| if (result != VK_SUCCESS) |
| goto err_free_fragment_bo; |
| |
| result = pvr_pds_fragment_program_create_and_upload(device, |
| allocator, |
| *fs, |
| fragment_state); |
| if (result != VK_SUCCESS) |
| goto err_free_coeff_program; |
| |
| result = pvr_pds_descriptor_program_create_and_upload( |
| device, |
| allocator, |
| layout, |
| MESA_SHADER_FRAGMENT, |
| &gfx_pipeline->fs_data, |
| &fragment_state->descriptor_state); |
| if (result != VK_SUCCESS) |
| goto err_free_frag_program; |
| |
|       /* The fragment descriptor program is not expected to need any PDS |
|        * temps; if that ever changes, we need to MAX2() against the |
|        * existing value and set |
|        * `fragment_state->stage_state.pds_temps_count` appropriately. |
|        */ |
| assert(fragment_state->descriptor_state.pds_info.temps_required == 0); |
| } |
| |
| result = pvr_pds_vertex_attrib_programs_create_and_upload( |
| device, |
| allocator, |
| &gfx_pipeline->vs_data, |
| vtx_dma_descriptions, |
| vtx_dma_count, |
| &vertex_state->pds_attrib_programs); |
| if (result != VK_SUCCESS) |
| goto err_free_frag_descriptor_program; |
| |
| result = pvr_pds_descriptor_program_create_and_upload( |
| device, |
| allocator, |
| layout, |
| MESA_SHADER_VERTEX, |
| &gfx_pipeline->vs_data, |
| &vertex_state->descriptor_state); |
| if (result != VK_SUCCESS) |
| goto err_free_vertex_attrib_program; |
| |
| /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a |
| * scratch buffer for both vertex and fragment stage. |
| * Figure out the best place to do this. |
| */ |
| /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */ |
| /* TODO: Implement spilling with the above. */ |
| |
| ralloc_free(shader_mem_ctx); |
| |
| return VK_SUCCESS; |
| |
| err_free_vertex_attrib_program: |
| for (uint32_t i = 0; i < ARRAY_SIZE(vertex_state->pds_attrib_programs); |
| i++) { |
| struct pvr_pds_attrib_program *const attrib_program = |
| &vertex_state->pds_attrib_programs[i]; |
| |
| pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program); |
| } |
| err_free_frag_descriptor_program: |
| pvr_pds_descriptor_program_destroy(device, |
| allocator, |
| &fragment_state->descriptor_state); |
| err_free_frag_program: |
| pvr_bo_suballoc_free(fragment_state->pds_fragment_program.pvr_bo); |
| err_free_coeff_program: |
| pvr_bo_suballoc_free(fragment_state->pds_coeff_program.pvr_bo); |
| err_free_fragment_bo: |
| pvr_bo_suballoc_free(fragment_state->bo); |
| err_free_vertex_bo: |
| pvr_bo_suballoc_free(vertex_state->bo); |
| err_free_build_context: |
| ralloc_free(shader_mem_ctx); |
| return result; |
| } |
| |
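| /* Builds the render pass state consumed by vk_graphics_pipeline_state_fill() |
|  * from the subpass' attachment usage. |
|  */ |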
| static struct vk_render_pass_state |
| pvr_create_renderpass_state(const VkGraphicsPipelineCreateInfo *const info) |
| { |
| PVR_FROM_HANDLE(pvr_render_pass, pass, info->renderPass); |
| const struct pvr_render_subpass *const subpass = |
| &pass->subpasses[info->subpass]; |
| |
| enum vk_rp_attachment_flags attachments = 0; |
| |
| assert(info->subpass < pass->subpass_count); |
| |
| for (uint32_t i = 0; i < subpass->color_count; i++) { |
| if (pass->attachments[subpass->color_attachments[i]].aspects) |
| attachments |= MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << i; |
| } |
| |
| if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) { |
| VkImageAspectFlags ds_aspects = |
| pass->attachments[subpass->depth_stencil_attachment].aspects; |
| if (ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) |
| attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT; |
| if (ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) |
| attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT; |
| } |
| |
| return (struct vk_render_pass_state){ |
| .attachments = attachments, |
| |
| /* TODO: This is only needed for VK_KHR_create_renderpass2 (or core 1.2), |
| * which is not currently supported. |
| */ |
| .view_mask = 0, |
| }; |
| } |
| |
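| /* Initializes the pipeline state and compiles and uploads its shaders. On |
|  * failure the pipeline is finished but not freed. |
|  */ |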
| static VkResult |
| pvr_graphics_pipeline_init(struct pvr_device *device, |
| struct vk_pipeline_cache *cache, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *allocator, |
| struct pvr_graphics_pipeline *gfx_pipeline) |
| { |
| struct vk_dynamic_graphics_state *const dynamic_state = |
| &gfx_pipeline->dynamic_state; |
| const struct vk_render_pass_state rp_state = |
| pvr_create_renderpass_state(pCreateInfo); |
| |
| struct vk_graphics_pipeline_all_state all_state; |
| struct vk_graphics_pipeline_state state = { 0 }; |
| |
| VkResult result; |
| |
| pvr_pipeline_init(device, |
| PVR_PIPELINE_TYPE_GRAPHICS, |
| pCreateInfo->layout, |
| &gfx_pipeline->base); |
| |
| result = vk_graphics_pipeline_state_fill(&device->vk, |
| &state, |
| pCreateInfo, |
| &rp_state, |
| 0, |
| &all_state, |
| NULL, |
| 0, |
| NULL); |
| if (result != VK_SUCCESS) |
| goto err_pipeline_finish; |
| |
| vk_dynamic_graphics_state_init(dynamic_state); |
| |
| /* Load static state into base dynamic state holder. */ |
| vk_dynamic_graphics_state_fill(dynamic_state, &state); |
| |
| /* The value of ms.rasterization_samples is undefined when |
| * rasterizer_discard_enable is set, but we need a specific value. |
| * Fill that in here. |
| */ |
| if (state.rs->rasterizer_discard_enable) |
| dynamic_state->ms.rasterization_samples = VK_SAMPLE_COUNT_1_BIT; |
| |
| memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices)); |
| |
| for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) { |
| VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage; |
| gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage); |
| /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo: |
| * |
| * "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS, |
| * or VK_SHADER_STAGE_ALL." |
| * |
| * So we don't handle that. |
| * |
| * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and |
| * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and |
| * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures |
| * structure returned by the driver. |
| */ |
| switch (pCreateInfo->pStages[i].stage) { |
| case VK_SHADER_STAGE_VERTEX_BIT: |
| case VK_SHADER_STAGE_FRAGMENT_BIT: |
| gfx_pipeline->stage_indices[gl_stage] = i; |
| break; |
| default: |
| unreachable("Unsupported stage."); |
| } |
| } |
| |
| /* Compiles and uploads shaders and PDS programs. */ |
| result = pvr_graphics_pipeline_compile(device, |
| cache, |
| pCreateInfo, |
| allocator, |
| gfx_pipeline); |
| if (result != VK_SUCCESS) |
| goto err_pipeline_finish; |
| |
| return VK_SUCCESS; |
| |
| err_pipeline_finish: |
| pvr_pipeline_finish(device, &gfx_pipeline->base); |
| |
| return result; |
| } |
| |
| /* If allocator == NULL, the internal one will be used. */ |
| static VkResult |
| pvr_graphics_pipeline_create(struct pvr_device *device, |
| struct vk_pipeline_cache *cache, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *allocator, |
| VkPipeline *const pipeline_out) |
| { |
| struct pvr_graphics_pipeline *gfx_pipeline; |
| VkResult result; |
| |
| gfx_pipeline = vk_zalloc2(&device->vk.alloc, |
| allocator, |
| sizeof(*gfx_pipeline), |
| 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); |
| if (!gfx_pipeline) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| /* Compiles and uploads shaders and PDS programs too. */ |
| result = pvr_graphics_pipeline_init(device, |
| cache, |
| pCreateInfo, |
| allocator, |
| gfx_pipeline); |
| if (result != VK_SUCCESS) { |
| vk_free2(&device->vk.alloc, allocator, gfx_pipeline); |
| return result; |
| } |
| |
| *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base); |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult |
| pvr_CreateGraphicsPipelines(VkDevice _device, |
| VkPipelineCache pipelineCache, |
| uint32_t createInfoCount, |
| const VkGraphicsPipelineCreateInfo *pCreateInfos, |
| const VkAllocationCallbacks *pAllocator, |
| VkPipeline *pPipelines) |
| { |
| VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache); |
| PVR_FROM_HANDLE(pvr_device, device, _device); |
| VkResult result = VK_SUCCESS; |
| |
| for (uint32_t i = 0; i < createInfoCount; i++) { |
| const VkResult local_result = |
| pvr_graphics_pipeline_create(device, |
| cache, |
| &pCreateInfos[i], |
| pAllocator, |
| &pPipelines[i]); |
| if (local_result != VK_SUCCESS) { |
| result = local_result; |
| pPipelines[i] = VK_NULL_HANDLE; |
| } |
| } |
| |
| return result; |
| } |
| |
| /***************************************************************************** |
| Other functions |
| *****************************************************************************/ |
| |
| void pvr_DestroyPipeline(VkDevice _device, |
| VkPipeline _pipeline, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline); |
| PVR_FROM_HANDLE(pvr_device, device, _device); |
| |
| if (!pipeline) |
| return; |
| |
| switch (pipeline->type) { |
| case PVR_PIPELINE_TYPE_GRAPHICS: { |
| struct pvr_graphics_pipeline *const gfx_pipeline = |
| to_pvr_graphics_pipeline(pipeline); |
| |
| pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline); |
| break; |
| } |
| |
| case PVR_PIPELINE_TYPE_COMPUTE: { |
| struct pvr_compute_pipeline *const compute_pipeline = |
| to_pvr_compute_pipeline(pipeline); |
| |
| pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline); |
| break; |
| } |
| |
| default: |
| unreachable("Unknown pipeline type."); |
| } |
| } |