/*
* Copyright © 2022 Imagination Technologies Ltd.
*
* based in part on v3dv driver which is:
* Copyright © 2019 Raspberry Pi
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>
#include "compiler/shader_enums.h"
#include "hwdef/rogue_hw_utils.h"
#include "nir/nir.h"
#include "pco/pco.h"
#include "pco/pco_data.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_hardcode.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_robustness.h"
#include "pvr_shader.h"
#include "pvr_types.h"
#include "rogue/rogue.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "vk_alloc.h"
#include "vk_format.h"
#include "vk_graphics_state.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_pipeline_cache.h"
#include "vk_pipeline_layout.h"
#include "vk_render_pass.h"
#include "vk_util.h"
#include "vulkan/runtime/vk_pipeline.h"
/*****************************************************************************
PDS functions
*****************************************************************************/
/* If allocator == NULL, the internal one will be used. */
static VkResult pvr_pds_coeff_program_create_and_upload(
struct pvr_device *device,
const VkAllocationCallbacks *allocator,
struct pvr_pds_coeff_loading_program *program,
struct pvr_fragment_shader_state *fragment_state)
{
uint32_t staging_buffer_size;
uint32_t *staging_buffer;
VkResult result;
assert(program->num_fpu_iterators < PVR_MAXIMUM_ITERATIONS);
/* Get the size of the program and then allocate that much memory. */
pvr_pds_coefficient_loading(program, NULL, PDS_GENERATE_SIZES);
if (!program->code_size) {
fragment_state->pds_coeff_program.pvr_bo = NULL;
fragment_state->pds_coeff_program.code_size = 0;
fragment_state->pds_coeff_program.data_size = 0;
fragment_state->stage_state.pds_temps_count = 0;
return VK_SUCCESS;
}
staging_buffer_size =
PVR_DW_TO_BYTES(program->code_size + program->data_size);
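/* The staging buffer holds the data segment first, followed by the code
 * segment: the pvr_gpu_upload_pds() call below reads the data from
 * &staging_buffer[0] and the code from &staging_buffer[program->data_size].
 */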
staging_buffer = vk_alloc2(&device->vk.alloc,
allocator,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* Generate the program into the staging_buffer. */
pvr_pds_coefficient_loading(program,
staging_buffer,
PDS_GENERATE_CODEDATA_SEGMENTS);
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
&staging_buffer[0],
program->data_size,
16,
&staging_buffer[program->data_size],
program->code_size,
16,
16,
&fragment_state->pds_coeff_program);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return result;
}
vk_free2(&device->vk.alloc, allocator, staging_buffer);
fragment_state->stage_state.pds_temps_count = program->temps_used;
return VK_SUCCESS;
}
/* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
/* If allocator == NULL, the internal one will be used. */
VkResult pvr_pds_fragment_program_create_and_upload(
struct pvr_device *device,
const VkAllocationCallbacks *allocator,
pco_shader *fs,
struct pvr_fragment_shader_state *fragment_state)
{
/* TODO: remove the below + revert the pvr_pds_setup_doutu
* args and make sure fs isn't NULL instead;
* temporarily in place for hardcoded load ops in
* pvr_pass.c:pvr_generate_load_op_shader()
*/
unsigned temps = 0;
bool has_phase_rate_change = false;
unsigned entry_offset = 0;
if (fs) {
pco_data *fs_data = pco_shader_data(fs);
temps = fs_data->common.temps;
has_phase_rate_change = fs_data->fs.uses.phase_change;
entry_offset = fs_data->common.entry_offset;
}
struct pvr_pds_kickusc_program program = { 0 };
uint32_t staging_buffer_size;
uint32_t *staging_buffer;
VkResult result;
const pvr_dev_addr_t exec_addr =
PVR_DEV_ADDR_OFFSET(fragment_state->bo->dev_addr,
/* fs_data->common.entry_offset */ entry_offset);
/* Note this is not strictly required to be done before calculating the
* staging_buffer_size in this particular case. It can also be done after
* allocating the buffer. The size from pvr_pds_kick_usc() is constant.
*/
pvr_pds_setup_doutu(
&program.usc_task_control,
exec_addr.addr,
/* fs_data->common.temps */ temps,
fragment_state->sample_rate,
/* fs_data->fs.uses.phase_change */ has_phase_rate_change);
pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);
staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
staging_buffer = vk_alloc2(&device->vk.alloc,
allocator,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pvr_pds_kick_usc(&program,
staging_buffer,
0,
false,
PDS_GENERATE_CODEDATA_SEGMENTS);
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
&staging_buffer[0],
program.data_size,
16,
&staging_buffer[program.data_size],
program.code_size,
16,
16,
&fragment_state->pds_fragment_program);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return result;
}
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return VK_SUCCESS;
}
static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
const struct pvr_device_info *dev_info,
bool robust_buffer_access)
{
/* FIXME: Use more local variables to improve formatting. */
/* Maximum memory allocation needed for const map entries in
* pvr_pds_generate_vertex_primary_program().
* When robustBufferAccess is disabled, it must be >= 410.
* When robustBufferAccess is enabled, it must be >= 570.
*
* 1. Size of entry for base instance
* (pvr_const_map_entry_base_instance)
*
* 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
* if (!robustBufferAccess)
* size of vertex attribute entry
* (pvr_const_map_entry_vertex_attribute_address) +
* else
* size of robust vertex attribute entry
* (pvr_const_map_entry_robust_vertex_attribute_address) +
* size of entry for max attribute index
* (pvr_const_map_entry_vertex_attribute_max_index) +
* fi
* size of Unified Store burst entry
* (pvr_const_map_entry_literal32) +
* size of entry for vertex stride
* (pvr_const_map_entry_literal32) +
* size of entries for DDMAD control word
* (num_ddmad_literals * pvr_const_map_entry_literal32))
*
* 3. Size of entry for DOUTW vertex/instance control word
* (pvr_const_map_entry_literal32)
*
* 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
*/
const size_t attribute_size =
(!robust_buffer_access)
? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
: sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);
/* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word
* and is increased by one DWORD to contain the data for the DDMADT's
* out-of-bounds check.
*/
const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);
return (sizeof(struct pvr_const_map_entry_base_instance) +
PVR_MAX_VERTEX_INPUT_BINDINGS *
(attribute_size +
(2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
sizeof(struct pvr_const_map_entry_literal32)) +
sizeof(struct pvr_const_map_entry_literal32) +
sizeof(struct pvr_const_map_entry_doutu_address));
}
static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
struct pvr_pds_vertex_primary_program_input *const input,
struct pvr_pds_attrib_program *const program_out)
{
const size_t const_entries_size_in_bytes =
pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
&device->pdevice->dev_info,
device->vk.enabled_features.robustBufferAccess);
struct pvr_pds_upload *const program = &program_out->program;
struct pvr_pds_info *const info = &program_out->info;
struct pvr_const_map_entry *new_entries;
ASSERTED uint32_t code_size_in_dwords;
size_t staging_buffer_size;
uint32_t *staging_buffer;
VkResult result;
memset(info, 0, sizeof(*info));
info->entries = vk_alloc2(&device->vk.alloc,
allocator,
const_entries_size_in_bytes,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!info->entries) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_out;
}
info->entries_size_in_bytes = const_entries_size_in_bytes;
pvr_pds_generate_vertex_primary_program(
input,
NULL,
info,
device->vk.enabled_features.robustBufferAccess,
&device->pdevice->dev_info);
code_size_in_dwords = info->code_size_in_dwords;
staging_buffer_size = PVR_DW_TO_BYTES(info->code_size_in_dwords);
staging_buffer = vk_alloc2(&device->vk.alloc,
allocator,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_entries;
}
/* This also fills in info->entries. */
pvr_pds_generate_vertex_primary_program(
input,
staging_buffer,
info,
device->vk.enabled_features.robustBufferAccess,
&device->pdevice->dev_info);
assert(info->code_size_in_dwords <= code_size_in_dwords);
/* FIXME: Add a vk_realloc2() ? */
new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
info->entries,
info->entries_written_size_in_bytes,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!new_entries) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_staging_buffer;
}
info->entries = new_entries;
info->entries_size_in_bytes = info->entries_written_size_in_bytes;
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
NULL,
0,
0,
staging_buffer,
info->code_size_in_dwords,
16,
16,
program);
if (result != VK_SUCCESS)
goto err_free_staging_buffer;
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return VK_SUCCESS;
err_free_staging_buffer:
vk_free2(&device->vk.alloc, allocator, staging_buffer);
err_free_entries:
vk_free2(&device->vk.alloc, allocator, info->entries);
err_out:
return result;
}
static inline void pvr_pds_vertex_attrib_program_destroy(
struct pvr_device *const device,
const struct VkAllocationCallbacks *const allocator,
struct pvr_pds_attrib_program *const program)
{
pvr_bo_suballoc_free(program->program.pvr_bo);
vk_free2(&device->vk.alloc, allocator, program->info.entries);
}
/* This is a const pointer to an array of pvr_pds_attrib_program structs.
* The array pointed to has PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT elements.
*/
typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
[PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];
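/* For illustration: given `pvr_pds_attrib_programs_array_ptr ptr`, element i
 * is accessed as (*ptr)[i]; pvr_pds_vertex_attrib_programs_create_and_upload()
 * below does this by first dereferencing the pointer into `programs_out`.
 */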
/* Generates and uploads a PDS program for DMAing vertex attribs into USC vertex
* inputs. This will bake the code segment and create a template of the data
* segment for the command buffer to fill in.
*/
/* If allocator == NULL, the internal one will be used.
*
* programs_out_ptr is a pointer to the array where the outputs will be placed.
*/
static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
struct pvr_device *device,
const VkAllocationCallbacks *const allocator,
pco_data *shader_data,
const struct pvr_pds_vertex_dma
dma_descriptions[static const PVR_MAX_VERTEX_ATTRIB_DMAS],
uint32_t dma_count,
pvr_pds_attrib_programs_array_ptr programs_out_ptr)
{
struct pvr_pds_vertex_primary_program_input input = {
.dma_list = dma_descriptions,
.dma_count = dma_count,
};
uint32_t usc_temp_count = shader_data->common.temps;
struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
VkResult result;
pco_range *sys_vals = shader_data->common.sys_vals;
if (sys_vals[SYSTEM_VALUE_VERTEX_ID].count > 0) {
input.flags |= PVR_PDS_VERTEX_FLAGS_VERTEX_ID_REQUIRED;
input.vertex_id_register = sys_vals[SYSTEM_VALUE_VERTEX_ID].start;
}
if (sys_vals[SYSTEM_VALUE_INSTANCE_ID].count > 0) {
input.flags |= PVR_PDS_VERTEX_FLAGS_INSTANCE_ID_REQUIRED;
input.instance_id_register = sys_vals[SYSTEM_VALUE_INSTANCE_ID].start;
}
if (sys_vals[SYSTEM_VALUE_BASE_INSTANCE].count > 0) {
input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_REQUIRED;
input.base_instance_register = sys_vals[SYSTEM_VALUE_BASE_INSTANCE].start;
}
if (sys_vals[SYSTEM_VALUE_BASE_VERTEX].count > 0) {
input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_VERTEX_REQUIRED;
input.base_vertex_register = sys_vals[SYSTEM_VALUE_BASE_VERTEX].start;
}
if (sys_vals[SYSTEM_VALUE_DRAW_ID].count > 0) {
input.flags |= PVR_PDS_VERTEX_FLAGS_DRAW_INDEX_REQUIRED;
input.draw_index_register = sys_vals[SYSTEM_VALUE_DRAW_ID].start;
}
pvr_pds_setup_doutu(&input.usc_task_control,
0,
usc_temp_count,
ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
false);
/* Note: programs_out_ptr is a pointer to an array so this is fine. See the
* typedef.
*/
for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
uint32_t extra_flags;
switch (i) {
case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
extra_flags = 0;
break;
case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
extra_flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
break;
case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
extra_flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
break;
default:
unreachable("Invalid vertex attrib program type.");
}
input.flags |= extra_flags;
result =
pvr_pds_vertex_attrib_program_create_and_upload(device,
allocator,
&input,
&programs_out[i]);
if (result != VK_SUCCESS) {
for (uint32_t j = 0; j < i; j++) {
pvr_pds_vertex_attrib_program_destroy(device,
allocator,
&programs_out[j]);
}
return result;
}
input.flags &= ~extra_flags;
}
return VK_SUCCESS;
}
size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
{
/* Maximum memory allocation needed for const map entries in
* pvr_pds_generate_descriptor_upload_program().
* It must be >= 688 bytes. This size is calculated as the sum of:
*
* 1. Max. number of descriptor sets (8) * (
* size of descriptor entry
* (pvr_const_map_entry_descriptor_set) +
* size of Common Store burst entry
* (pvr_const_map_entry_literal32))
*
* 2. Max. number of PDS program buffers (24) * (
* size of the largest buffer structure
* (pvr_const_map_entry_constant_buffer) +
* size of Common Store burst entry
* (pvr_const_map_entry_literal32)
*
* 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
*/
/* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
* say that it should be 8.
* Figure out a define for this, or is the comment wrong?
*/
return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
sizeof(struct pvr_const_map_entry_literal32)) +
PVR_PDS_MAX_BUFFERS *
(sizeof(struct pvr_const_map_entry_constant_buffer) +
sizeof(struct pvr_const_map_entry_literal32)) +
sizeof(struct pvr_const_map_entry_doutu_address));
}
static VkResult pvr_pds_descriptor_program_create_and_upload(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
const struct vk_pipeline_layout *const layout,
gl_shader_stage stage,
pco_data *data,
struct pvr_stage_allocation_descriptor_state *const descriptor_state)
{
const size_t const_entries_size_in_bytes =
pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
struct pvr_pds_descriptor_program_input program = { 0 };
struct pvr_const_map_entry *new_entries;
ASSERTED uint32_t code_size_in_dwords;
uint32_t staging_buffer_size;
uint32_t *staging_buffer;
VkResult result;
*pds_info = (struct pvr_pds_info){ 0 };
for (unsigned desc_set = 0; desc_set < layout->set_count; ++desc_set) {
const struct pvr_descriptor_set_layout *set_layout =
vk_to_pvr_descriptor_set_layout(layout->set_layouts[desc_set]);
const pco_descriptor_set_data *desc_set_data =
&data->common.desc_sets[desc_set];
const pco_range *desc_set_range = &desc_set_data->range;
/* If the descriptor set isn't for this stage or is unused, skip it. */
if (!(BITFIELD_BIT(stage) & set_layout->stage_flags)) {
assert(!desc_set_data->used);
continue;
}
if (!desc_set_data->used)
continue;
program.descriptor_sets[program.descriptor_set_count] =
(struct pvr_pds_descriptor_set){
.descriptor_set = desc_set,
.size_in_dwords = desc_set_range->count,
.destination = desc_set_range->start,
};
program.descriptor_set_count++;
}
pds_info->entries = vk_alloc2(&device->vk.alloc,
allocator,
const_entries_size_in_bytes,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!pds_info->entries) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_static_consts;
}
pds_info->entries_size_in_bytes = const_entries_size_in_bytes;
pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);
code_size_in_dwords = pds_info->code_size_in_dwords;
staging_buffer_size = PVR_DW_TO_BYTES(pds_info->code_size_in_dwords);
if (!staging_buffer_size) {
vk_free2(&device->vk.alloc, allocator, pds_info->entries);
*descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };
return VK_SUCCESS;
}
staging_buffer = vk_alloc2(&device->vk.alloc,
allocator,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_entries;
}
pvr_pds_generate_descriptor_upload_program(&program,
staging_buffer,
pds_info);
assert(pds_info->code_size_in_dwords <= code_size_in_dwords);
/* FIXME: use vk_realloc2() ? */
new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
pds_info->entries,
pds_info->entries_written_size_in_bytes,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!new_entries) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_staging_buffer;
}
pds_info->entries = new_entries;
pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
NULL,
0,
0,
staging_buffer,
pds_info->code_size_in_dwords,
16,
16,
&descriptor_state->pds_code);
if (result != VK_SUCCESS)
goto err_free_staging_buffer;
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return VK_SUCCESS;
err_free_staging_buffer:
vk_free2(&device->vk.alloc, allocator, staging_buffer);
err_free_entries:
vk_free2(&device->vk.alloc, allocator, pds_info->entries);
err_free_static_consts:
pvr_bo_suballoc_free(descriptor_state->static_consts);
return result;
}
static void pvr_pds_descriptor_program_destroy(
struct pvr_device *const device,
const struct VkAllocationCallbacks *const allocator,
struct pvr_stage_allocation_descriptor_state *const descriptor_state)
{
if (!descriptor_state)
return;
pvr_bo_suballoc_free(descriptor_state->pds_code.pvr_bo);
vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
pvr_bo_suballoc_free(descriptor_state->static_consts);
}
static void pvr_pds_compute_program_setup(
const struct pvr_device_info *dev_info,
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
uint32_t barrier_coefficient,
bool add_base_workgroup,
uint32_t usc_temps,
pvr_dev_addr_t usc_shader_dev_addr,
struct pvr_pds_compute_shader_program *const program)
{
pvr_pds_compute_shader_program_init(program);
program->local_input_regs[0] = local_input_regs[0];
program->local_input_regs[1] = local_input_regs[1];
program->local_input_regs[2] = local_input_regs[2];
program->work_group_input_regs[0] = work_group_input_regs[0];
program->work_group_input_regs[1] = work_group_input_regs[1];
program->work_group_input_regs[2] = work_group_input_regs[2];
program->barrier_coefficient = barrier_coefficient;
program->add_base_workgroup = add_base_workgroup;
program->flattened_work_groups = true;
program->kick_usc = true;
STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
PVR_WORKGROUP_DIMENSIONS);
STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
PVR_WORKGROUP_DIMENSIONS);
STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
PVR_WORKGROUP_DIMENSIONS);
pvr_pds_setup_doutu(&program->usc_task_control,
usc_shader_dev_addr.addr,
usc_temps,
ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
false);
pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
}
/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
*/
static VkResult pvr_pds_compute_program_create_and_upload(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
uint32_t barrier_coefficient,
uint32_t usc_temps,
pvr_dev_addr_t usc_shader_dev_addr,
struct pvr_pds_upload *const pds_upload_out,
struct pvr_pds_info *const pds_info_out)
{
struct pvr_device_info *dev_info = &device->pdevice->dev_info;
struct pvr_pds_compute_shader_program program;
uint32_t staging_buffer_size;
uint32_t *staging_buffer;
VkResult result;
pvr_pds_compute_program_setup(dev_info,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
false,
usc_temps,
usc_shader_dev_addr,
&program);
/* FIXME: According to pvr_device_init_compute_pds_program() the code size
* is in bytes. Investigate this.
*/
staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
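/* Unlike the coefficient/fragment staging buffers above, the code segment is
 * generated at &staging_buffer[0] and the data segment after it at
 * &staging_buffer[program.code_size]; the pvr_gpu_upload_pds() call below
 * passes them back in (data, code) order.
 */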
staging_buffer = vk_alloc2(&device->vk.alloc,
allocator,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* FIXME: pvr_pds_compute_shader doesn't implement
* PDS_GENERATE_CODEDATA_SEGMENTS.
*/
pvr_pds_compute_shader(&program,
&staging_buffer[0],
PDS_GENERATE_CODE_SEGMENT,
dev_info);
pvr_pds_compute_shader(&program,
&staging_buffer[program.code_size],
PDS_GENERATE_DATA_SEGMENT,
dev_info);
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
&staging_buffer[program.code_size],
program.data_size,
16,
&staging_buffer[0],
program.code_size,
16,
16,
pds_upload_out);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return result;
}
*pds_info_out = (struct pvr_pds_info){
.temps_required = program.highest_temp,
.code_size_in_dwords = program.code_size,
.data_size_in_dwords = program.data_size,
};
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return VK_SUCCESS;
}
static void pvr_pds_compute_program_destroy(
struct pvr_device *const device,
const struct VkAllocationCallbacks *const allocator,
struct pvr_pds_upload *const pds_program,
struct pvr_pds_info *const pds_info)
{
/* We don't allocate an entries buffer so we don't need to free it. */
pvr_bo_suballoc_free(pds_program->pvr_bo);
}
/* This only uploads the code segment. The data segment will need to be patched
* with the base workgroup before uploading.
*/
static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
uint32_t barrier_coefficient,
uint32_t usc_temps,
pvr_dev_addr_t usc_shader_dev_addr,
struct pvr_pds_base_workgroup_program *program_out)
{
struct pvr_device_info *dev_info = &device->pdevice->dev_info;
struct pvr_pds_compute_shader_program program;
uint32_t buffer_size;
uint32_t *buffer;
VkResult result;
pvr_pds_compute_program_setup(dev_info,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
true,
usc_temps,
usc_shader_dev_addr,
&program);
/* FIXME: According to pvr_device_init_compute_pds_program() the code size
* is in bytes. Investigate this.
*/
buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size));
buffer = vk_alloc2(&device->vk.alloc,
allocator,
buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pvr_pds_compute_shader(&program,
&buffer[0],
PDS_GENERATE_CODE_SEGMENT,
dev_info);
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
NULL,
0,
0,
buffer,
program.code_size,
16,
16,
&program_out->code_upload);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, buffer);
return result;
}
pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
program_out->data_section = buffer;
/* We'll need to patch the base workgroup in the PDS data section before
* dispatch so we save the offsets at which to patch. We only need to save
* the offset for the first workgroup id since the workgroup ids are stored
* contiguously in the data segment.
*/
program_out->base_workgroup_data_patching_offset =
program.base_workgroup_constant_offset_in_dwords[0];
program_out->info = (struct pvr_pds_info){
.temps_required = program.highest_temp,
.code_size_in_dwords = program.code_size,
.data_size_in_dwords = program.data_size,
};
return VK_SUCCESS;
}
static void pvr_pds_compute_base_workgroup_variant_program_finish(
struct pvr_device *device,
const VkAllocationCallbacks *const allocator,
struct pvr_pds_base_workgroup_program *const state)
{
pvr_bo_suballoc_free(state->code_upload.pvr_bo);
vk_free2(&device->vk.alloc, allocator, state->data_section);
}
/******************************************************************************
Generic pipeline functions
******************************************************************************/
static void pvr_pipeline_init(struct pvr_device *device,
enum pvr_pipeline_type type,
const VkPipelineLayout layout,
struct pvr_pipeline *const pipeline)
{
vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
pipeline->type = type;
assert(!pipeline->layout);
pipeline->layout = vk_pipeline_layout_from_handle(layout);
vk_pipeline_layout_ref(pipeline->layout);
}
static void pvr_pipeline_finish(struct pvr_device *device,
struct pvr_pipeline *pipeline)
{
vk_pipeline_layout_unref(&device->vk, pipeline->layout);
vk_object_base_finish(&pipeline->base);
}
/* How many shared regs it takes to store a pvr_dev_addr_t.
* Each shared reg is 32 bits.
*/
#define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
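/* A minimal worked example, assuming pvr_dev_addr_t wraps a 64-bit address:
 * DIV_ROUND_UP(sizeof(uint64_t), sizeof(uint32_t)) = DIV_ROUND_UP(8, 4) = 2
 * shared regs.
 */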
/******************************************************************************
Compute pipeline functions
******************************************************************************/
/* Compiles and uploads shaders and PDS programs. */
static VkResult pvr_compute_pipeline_compile(
struct pvr_device *const device,
struct vk_pipeline_cache *cache,
const VkComputePipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *const allocator,
struct pvr_compute_pipeline *const compute_pipeline)
{
struct vk_pipeline_layout *layout = compute_pipeline->base.layout;
uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
uint32_t barrier_coefficient;
uint32_t usc_temps;
VkResult result;
compute_pipeline->shader_state.const_shared_reg_count = 0;
/* FIXME: Compile and upload the shader. */
/* FIXME: Initialize the shader state and setup build info. */
unreachable("finishme: compute support");
result = pvr_pds_descriptor_program_create_and_upload(
device,
allocator,
layout,
MESA_SHADER_COMPUTE,
NULL,
&compute_pipeline->descriptor_state);
if (result != VK_SUCCESS)
goto err_free_shader;
result = pvr_pds_compute_program_create_and_upload(
device,
allocator,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
usc_temps,
compute_pipeline->shader_state.bo->dev_addr,
&compute_pipeline->primary_program,
&compute_pipeline->primary_program_info);
if (result != VK_SUCCESS)
goto err_free_descriptor_program;
/* If the workgroup ID is required, then we require the base workgroup
* variant of the PDS compute program as well.
*/
compute_pipeline->flags.base_workgroup =
work_group_input_regs[0] != PVR_PDS_REG_UNUSED ||
work_group_input_regs[1] != PVR_PDS_REG_UNUSED ||
work_group_input_regs[2] != PVR_PDS_REG_UNUSED;
if (compute_pipeline->flags.base_workgroup) {
result = pvr_pds_compute_base_workgroup_variant_program_init(
device,
allocator,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
usc_temps,
compute_pipeline->shader_state.bo->dev_addr,
&compute_pipeline->primary_base_workgroup_variant_program);
if (result != VK_SUCCESS)
goto err_destroy_compute_program;
}
return VK_SUCCESS;
err_destroy_compute_program:
pvr_pds_compute_program_destroy(device,
allocator,
&compute_pipeline->primary_program,
&compute_pipeline->primary_program_info);
err_free_descriptor_program:
pvr_pds_descriptor_program_destroy(device,
allocator,
&compute_pipeline->descriptor_state);
err_free_shader:
pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
return result;
}
static VkResult
pvr_compute_pipeline_init(struct pvr_device *device,
struct vk_pipeline_cache *cache,
const VkComputePipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *allocator,
struct pvr_compute_pipeline *compute_pipeline)
{
VkResult result;
pvr_pipeline_init(device,
PVR_PIPELINE_TYPE_COMPUTE,
pCreateInfo->layout,
&compute_pipeline->base);
result = pvr_compute_pipeline_compile(device,
cache,
pCreateInfo,
allocator,
compute_pipeline);
if (result != VK_SUCCESS) {
pvr_pipeline_finish(device, &compute_pipeline->base);
return result;
}
return VK_SUCCESS;
}
static VkResult
pvr_compute_pipeline_create(struct pvr_device *device,
struct vk_pipeline_cache *cache,
const VkComputePipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *allocator,
VkPipeline *const pipeline_out)
{
struct pvr_compute_pipeline *compute_pipeline;
VkResult result;
compute_pipeline = vk_zalloc2(&device->vk.alloc,
allocator,
sizeof(*compute_pipeline),
8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!compute_pipeline)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* Compiles and uploads shaders and PDS programs. */
result = pvr_compute_pipeline_init(device,
cache,
pCreateInfo,
allocator,
compute_pipeline);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, compute_pipeline);
return result;
}
*pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);
return VK_SUCCESS;
}
static void pvr_compute_pipeline_destroy(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
struct pvr_compute_pipeline *const compute_pipeline)
{
if (compute_pipeline->flags.base_workgroup) {
pvr_pds_compute_base_workgroup_variant_program_finish(
device,
allocator,
&compute_pipeline->primary_base_workgroup_variant_program);
}
pvr_pds_compute_program_destroy(device,
allocator,
&compute_pipeline->primary_program,
&compute_pipeline->primary_program_info);
pvr_pds_descriptor_program_destroy(device,
allocator,
&compute_pipeline->descriptor_state);
pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
pvr_pipeline_finish(device, &compute_pipeline->base);
vk_free2(&device->vk.alloc, allocator, compute_pipeline);
}
VkResult
pvr_CreateComputePipelines(VkDevice _device,
VkPipelineCache pipelineCache,
uint32_t createInfoCount,
const VkComputePipelineCreateInfo *pCreateInfos,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipelines)
{
VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
PVR_FROM_HANDLE(pvr_device, device, _device);
VkResult result = VK_SUCCESS;
for (uint32_t i = 0; i < createInfoCount; i++) {
const VkResult local_result =
pvr_compute_pipeline_create(device,
cache,
&pCreateInfos[i],
pAllocator,
&pPipelines[i]);
if (local_result != VK_SUCCESS) {
result = local_result;
pPipelines[i] = VK_NULL_HANDLE;
}
}
return result;
}
/******************************************************************************
Graphics pipeline functions
******************************************************************************/
static void pvr_pipeline_destroy_shader_data(pco_data *data)
{
for (unsigned u = 0; u < ARRAY_SIZE(data->common.desc_sets); ++u)
if (data->common.desc_sets[u].bindings)
ralloc_free(data->common.desc_sets[u].bindings);
}
static void
pvr_graphics_pipeline_destroy(struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
struct pvr_graphics_pipeline *const gfx_pipeline)
{
const uint32_t num_vertex_attrib_programs =
ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs);
pvr_pds_descriptor_program_destroy(
device,
allocator,
&gfx_pipeline->shader_state.fragment.descriptor_state);
pvr_pds_descriptor_program_destroy(
device,
allocator,
&gfx_pipeline->shader_state.vertex.descriptor_state);
for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
struct pvr_pds_attrib_program *const attrib_program =
&gfx_pipeline->shader_state.vertex.pds_attrib_programs[i];
pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
}
pvr_bo_suballoc_free(
gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
pvr_bo_suballoc_free(
gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo);
pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo);
pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo);
pvr_pipeline_finish(device, &gfx_pipeline->base);
pvr_pipeline_destroy_shader_data(&gfx_pipeline->vs_data);
pvr_pipeline_destroy_shader_data(&gfx_pipeline->fs_data);
vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
}
static void pvr_vertex_state_save(struct pvr_graphics_pipeline *gfx_pipeline,
pco_shader *vs)
{
struct pvr_vertex_shader_state *vertex_state =
&gfx_pipeline->shader_state.vertex;
const pco_data *shader_data = pco_shader_data(vs);
memcpy(&gfx_pipeline->vs_data, shader_data, sizeof(*shader_data));
/* This ends up unused since we'll use the temp_usage for the PDS program we
* end up selecting, and the descriptor PDS program doesn't use any temps.
* Let's set it to ~0 in case it ever gets used.
*/
vertex_state->stage_state.pds_temps_count = ~0;
}
static void pvr_fragment_state_save(struct pvr_graphics_pipeline *gfx_pipeline,
pco_shader *fs)
{
struct pvr_fragment_shader_state *fragment_state =
&gfx_pipeline->shader_state.fragment;
const pco_data *shader_data = pco_shader_data(fs);
memcpy(&gfx_pipeline->fs_data, shader_data, sizeof(*shader_data));
/* TODO: add selection for other values of pass type and sample rate. */
fragment_state->pass_type = ROGUE_TA_PASSTYPE_OPAQUE;
fragment_state->sample_rate = ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE;
/* We can't initialize it yet since we still need to generate the PDS
* programs, so set it to `~0` to make sure that we set this up later on.
*/
fragment_state->stage_state.pds_temps_count = ~0;
}
static bool pvr_blend_factor_requires_consts(VkBlendFactor factor)
{
switch (factor) {
case VK_BLEND_FACTOR_CONSTANT_COLOR:
case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
case VK_BLEND_FACTOR_CONSTANT_ALPHA:
case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
return true;
default:
return false;
}
}
/**
* \brief Indicates whether dynamic blend constants are needed.
*
* If the user has specified the blend constants as dynamic, they might not
* actually be using them. This function checks whether they really are used,
* in order to determine whether we need to upload them later on for the
* shader to access.
*/
static bool pvr_graphics_pipeline_requires_dynamic_blend_consts(
const struct pvr_graphics_pipeline *gfx_pipeline)
{
const struct vk_dynamic_graphics_state *const state =
&gfx_pipeline->dynamic_state;
if (BITSET_TEST(state->set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
return false;
for (uint32_t i = 0; i < state->cb.attachment_count; i++) {
const struct vk_color_blend_attachment_state *attachment =
&state->cb.attachments[i];
const bool has_color_write =
attachment->write_mask &
(VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT);
const bool has_alpha_write = attachment->write_mask &
VK_COLOR_COMPONENT_A_BIT;
if (!attachment->blend_enable || attachment->write_mask == 0)
continue;
if (has_color_write) {
const uint8_t src_color_blend_factor =
attachment->src_color_blend_factor;
const uint8_t dst_color_blend_factor =
attachment->dst_color_blend_factor;
if (pvr_blend_factor_requires_consts(src_color_blend_factor) ||
pvr_blend_factor_requires_consts(dst_color_blend_factor)) {
return true;
}
}
if (has_alpha_write) {
const uint8_t src_alpha_blend_factor =
attachment->src_alpha_blend_factor;
const uint8_t dst_alpha_blend_factor =
attachment->dst_alpha_blend_factor;
if (pvr_blend_factor_requires_consts(src_alpha_blend_factor) ||
pvr_blend_factor_requires_consts(dst_alpha_blend_factor)) {
return true;
}
}
}
return false;
}
#undef PVR_DEV_ADDR_SIZE_IN_SH_REGS
static void pvr_graphics_pipeline_setup_vertex_dma(
struct pvr_graphics_pipeline *gfx_pipeline,
const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
struct pvr_pds_vertex_dma *const dma_descriptions,
uint32_t *const dma_count)
{
pco_vs_data *vs_data = &gfx_pipeline->vs_data.vs;
const VkVertexInputBindingDescription
*sorted_bindings[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
const VkVertexInputAttributeDescription
*sorted_attributes[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
/* Vertex attributes map to the `layout(location = x)` annotation in the
* shader where `x` is the attribute's location.
* Vertex bindings have NO relation to the shader. They have nothing to do
* with the `layout(set = x, binding = y)` notation. They instead indicate
* where the data for a collection of vertex attributes comes from. The
* application binds a VkBuffer with vkCmdBindVertexBuffers() to a specific
* binding number and based on that we'll know which buffer to DMA the data
* from, to fill in the collection of vertex attributes.
*/
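/* For example, a shader input declared as
 *   layout(location = 0) in vec4 a_position;
 * consumes attribute location 0, while attribDesc.binding only selects which
 * VkBuffer (bound with vkCmdBindVertexBuffers()) the attribute's data is
 * DMA'd from.
 */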
for (uint32_t i = 0; i < vertex_input_state->vertexBindingDescriptionCount;
i++) {
const VkVertexInputBindingDescription *binding_desc =
&vertex_input_state->pVertexBindingDescriptions[i];
sorted_bindings[binding_desc->binding] = binding_desc;
}
for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
i++) {
const VkVertexInputAttributeDescription *attribute_desc =
&vertex_input_state->pVertexAttributeDescriptions[i];
sorted_attributes[attribute_desc->location] = attribute_desc;
}
for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
i++) {
const VkVertexInputAttributeDescription *attribute = sorted_attributes[i];
if (!attribute)
continue;
gl_vert_attrib location = attribute->location + VERT_ATTRIB_GENERIC0;
const VkVertexInputBindingDescription *binding =
sorted_bindings[attribute->binding];
struct pvr_pds_vertex_dma *dma_desc = &dma_descriptions[*dma_count];
const struct util_format_description *fmt_description =
vk_format_description(attribute->format);
const pco_range *attrib_range = &vs_data->attribs[location];
/* Skip unused attributes. */
if (!attrib_range->count)
continue;
/* DMA setup. */
/* The PDS program sets up DDMADs to DMA attributes into vtxin regs.
*
* DDMAD -> Multiply, add, and DOUTD (i.e. DMA from that address).
* DMA source addr = src0 * src1 + src2
* DMA params = src3
*
* In the PDS program we set up src0 with the binding's stride and src1
* with either the instance id or vertex id (both of which get filled by
* the hardware). We set up src2 later on, once we know which VkBuffer to
* DMA the data from, so it's saved for later when we patch the data
* section.
*/
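/* Roughly, in terms of the Vulkan spec's vertex input address calculation
 * (22.4): src0 * src1 yields effectiveVertexOffset
 * (bindingDesc.stride * vertex/instance index), and the patched src2 is
 * expected to supply bufferBindingAddress (plus attribDesc.offset), so the
 * DMA source address works out to
 * bufferBindingAddress + attribDesc.offset + effectiveVertexOffset.
 */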
/* TODO: Right now we're setting up a DMA per attribute. In a case where
* there are multiple attributes packed into a single binding with
* adjacent locations we'd still be DMAing them separately. This is not
* great so the DMA setup should be smarter and could do with some
* optimization.
*/
*dma_desc = (struct pvr_pds_vertex_dma){ 0 };
/* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
* this corresponds to `attribDesc.offset`.
* The PDS program doesn't do anything with it but just save it in the
* PDS program entry.
*/
dma_desc->offset = attribute->offset;
/* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
* this corresponds to `bindingDesc.stride`.
* The PDS program will calculate the `effectiveVertexOffset` with this
* and add it to the address provided in the patched data segment.
*/
dma_desc->stride = binding->stride;
if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
dma_desc->flags = PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
else
dma_desc->flags = 0;
/* Size to DMA per vertex attribute. Used to set up src3 in the DDMAD. */
dma_desc->size_in_dwords = attrib_range->count;
/* Vtxin reg offset to start DMAing into. */
dma_desc->destination = attrib_range->start;
/* Will be used by the driver to figure out the buffer address to patch into
 * the data section, i.e. which binding we should DMA from.
*/
dma_desc->binding_index = attribute->binding;
/* We don't currently support VK_EXT_vertex_attribute_divisor, so no
 * repeating of instance-rate vertex attributes is needed. We should always
* move on to the next vertex attribute.
*/
assert(binding->inputRate != VK_VERTEX_INPUT_RATE_INSTANCE);
dma_desc->divisor = 1;
/* Will be used to generate PDS code that takes care of robust buffer
* access, and later on by the driver to write the correct robustness
* buffer address to DMA the fallback values from.
*/
dma_desc->robustness_buffer_offset =
pvr_get_robustness_buffer_format_offset(attribute->format);
/* Used later on by the driver to figure out if the buffer is being
* accessed out of bounds, for robust buffer access.
*/
dma_desc->component_size_in_bytes =
fmt_description->block.bits / fmt_description->nr_channels / 8;
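/* E.g. for VK_FORMAT_R32G32B32A32_SFLOAT: 128 block bits / 4 channels / 8 =
 * 4 bytes per component.
 */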
++*dma_count;
}
}
static void pvr_graphics_pipeline_setup_fragment_coeff_program(
struct pvr_graphics_pipeline *gfx_pipeline,
nir_shader *fs,
struct pvr_pds_coeff_loading_program *frag_coeff_program)
{
uint64_t varyings_used = fs->info.inputs_read &
BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING);
pco_vs_data *vs_data = &gfx_pipeline->vs_data.vs;
pco_fs_data *fs_data = &gfx_pipeline->fs_data.fs;
unsigned fpu = 0;
unsigned dest = 0;
if (fs_data->uses.z) {
pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
PDSINST_DOUT_FIELDS_DOUTI_SRC,
douti_src) {
/* TODO: define instead of sizeof(uint16_t). */
douti_src.f32_offset = fs_data->uses.w ? 1 * sizeof(uint16_t) : 0;
douti_src.f16_offset = douti_src.f32_offset;
douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D;
}
frag_coeff_program->destination[fpu++] = dest++;
}
if (fs_data->uses.w) {
pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
PDSINST_DOUT_FIELDS_DOUTI_SRC,
douti_src) {
douti_src.f32_offset = 0;
douti_src.f16_offset = douti_src.f32_offset;
douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D;
}
frag_coeff_program->destination[fpu++] = dest++;
}
if (fs_data->uses.pntc) {
pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
PDSINST_DOUT_FIELDS_DOUTI_SRC,
douti_src) {
douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_2D;
douti_src.pointsprite = true;
}
frag_coeff_program->destination[fpu++] = dest;
dest += 2;
}
u_foreach_bit64 (varying, varyings_used) {
nir_variable *var =
nir_find_variable_with_location(fs, nir_var_shader_in, varying);
assert(var);
pco_range *cf_range = &fs_data->varyings[varying];
assert(cf_range->count > 0);
assert(!(cf_range->start % ROGUE_USC_COEFFICIENT_SET_SIZE));
assert(!(cf_range->count % ROGUE_USC_COEFFICIENT_SET_SIZE));
pco_range *vtxout_range = &vs_data->varyings[varying];
assert(vtxout_range->count > 0);
assert(vtxout_range->start >= 4);
assert(vtxout_range->count ==
cf_range->count / ROGUE_USC_COEFFICIENT_SET_SIZE);
unsigned count = vtxout_range->count;
unsigned vtxout = vtxout_range->start;
/* pos.x, pos.y unused. */
vtxout -= 2;
/* pos.z unused. */
if (!fs_data->uses.z)
vtxout -= 1;
/* pos.w unused. */
if (!fs_data->uses.w)
vtxout -= 1;
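/* E.g. a varying allocated at vtxout register 6, in a shader that iterates
 * neither z nor w, ends up at adjusted register 2, i.e.
 * f32_offset = 2 * sizeof(uint16_t) = 4.
 */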
pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
PDSINST_DOUT_FIELDS_DOUTI_SRC,
douti_src) {
/* TODO: define instead of sizeof(uint16_t). */
douti_src.f32_offset = vtxout * sizeof(uint16_t);
/* TODO: f16 support. */
douti_src.f16 = false;
douti_src.f16_offset = douti_src.f32_offset;
switch (var->data.interpolation) {
case INTERP_MODE_SMOOTH:
douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
douti_src.perspective = true;
break;
case INTERP_MODE_NOPERSPECTIVE:
douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
break;
case INTERP_MODE_FLAT:
/* TODO: triangle fan, provoking vertex last. */
douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_FLAT_VERTEX0;
break;
default:
unreachable("Unimplemented interpolation type.");
}
douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D + count - 1;
}
frag_coeff_program->destination[fpu++] =
cf_range->start / ROGUE_USC_COEFFICIENT_SET_SIZE;
}
frag_coeff_program->num_fpu_iterators = fpu;
}
static void set_var(pco_range *allocation_list,
unsigned to,
nir_variable *var,
unsigned dwords_each)
{
unsigned slots = glsl_count_dword_slots(var->type, false);
allocation_list[var->data.location] = (pco_range){
.start = to,
.count = slots * dwords_each,
};
}
static void allocate_var(pco_range *allocation_list,
unsigned *counter,
nir_variable *var,
unsigned dwords_each)
{
unsigned slots = glsl_count_dword_slots(var->type, false);
allocation_list[var->data.location] = (pco_range){
.start = *counter,
.count = slots * dwords_each,
};
*counter += slots * dwords_each;
}
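/* E.g. a vec4 counts as 4 dword slots, so with dwords_each == 1 (vertex
 * inputs) it reserves 4 vtxin registers, and with
 * dwords_each == ROGUE_USC_COEFFICIENT_SET_SIZE (fragment varyings) it
 * reserves 4 coefficient sets.
 */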
static void try_allocate_var(pco_range *allocation_list,
unsigned *counter,
nir_shader *nir,
uint64_t bitset,
nir_variable_mode mode,
int location,
unsigned dwords_each)
{
nir_variable *var = nir_find_variable_with_location(nir, mode, location);
if (!(bitset & BITFIELD64_BIT(location)))
return;
assert(var);
allocate_var(allocation_list, counter, var, dwords_each);
}
static void try_allocate_vars(pco_range *allocation_list,
unsigned *counter,
nir_shader *nir,
uint64_t *bitset,
nir_variable_mode mode,
bool f16,
enum glsl_interp_mode interp_mode,
unsigned dwords_each)
{
uint64_t skipped = 0;
while (*bitset) {
int location = u_bit_scan64(bitset);
nir_variable *var = nir_find_variable_with_location(nir, mode, location);
assert(var);
if (glsl_type_is_16bit(glsl_without_array_or_matrix(var->type)) != f16 ||
var->data.interpolation != interp_mode) {
skipped |= BITFIELD64_BIT(location);
continue;
}
allocate_var(allocation_list, counter, var, dwords_each);
}
*bitset |= skipped;
}
static void allocate_val(pco_range *allocation_list,
unsigned *counter,
unsigned location,
unsigned dwords_each)
{
allocation_list[location] = (pco_range){
.start = *counter,
.count = dwords_each,
};
*counter += dwords_each;
}
static void pvr_alloc_vs_sysvals(pco_data *data, nir_shader *nir)
{
BITSET_DECLARE(system_values_read, SYSTEM_VALUE_MAX);
BITSET_COPY(system_values_read, nir->info.system_values_read);
gl_system_value sys_vals[] = {
SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_INSTANCE_ID,
SYSTEM_VALUE_BASE_INSTANCE, SYSTEM_VALUE_BASE_VERTEX,
SYSTEM_VALUE_DRAW_ID,
};
for (unsigned u = 0; u < ARRAY_SIZE(sys_vals); ++u) {
if (BITSET_TEST(system_values_read, sys_vals[u])) {
nir_intrinsic_op op = nir_intrinsic_from_system_value(sys_vals[u]);
unsigned dwords = nir_intrinsic_infos[op].dest_components;
assert(dwords > 0);
allocate_val(data->common.sys_vals,
&data->common.vtxins,
sys_vals[u],
dwords);
BITSET_CLEAR(system_values_read, sys_vals[u]);
}
}
assert(BITSET_IS_EMPTY(system_values_read));
}
static void pvr_init_vs_attribs(
pco_data *data,
const VkPipelineVertexInputStateCreateInfo *const vertex_input_state)
{
for (unsigned u = 0; u < vertex_input_state->vertexAttributeDescriptionCount;
++u) {
const VkVertexInputAttributeDescription *attrib =
&vertex_input_state->pVertexAttributeDescriptions[u];
gl_vert_attrib location = attrib->location + VERT_ATTRIB_GENERIC0;
data->vs.attrib_formats[location] =
vk_format_to_pipe_format(attrib->format);
}
}
static void pvr_alloc_vs_attribs(pco_data *data, nir_shader *nir)
{
/* TODO NEXT: this should be based on the format size. */
nir_foreach_shader_in_variable (var, nir) {
allocate_var(data->vs.attribs, &data->common.vtxins, var, 1);
}
}
static void pvr_alloc_vs_varyings(pco_data *data, nir_shader *nir)
{
uint64_t vars_mask = nir->info.outputs_written &
BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING);
/* Output position must be present. */
assert(nir_find_variable_with_location(nir,
nir_var_shader_out,
VARYING_SLOT_POS));
/* Varying ordering is specific. */
try_allocate_var(data->vs.varyings,
&data->vs.vtxouts,
nir,
nir->info.outputs_written,
nir_var_shader_out,
VARYING_SLOT_POS,
1);
/* Save varying counts. */
u_foreach_bit64 (location, vars_mask) {
nir_variable *var =
nir_find_variable_with_location(nir, nir_var_shader_out, location);
assert(var);
/* TODO: f16 support. */
bool f16 = glsl_type_is_16bit(glsl_without_array_or_matrix(var->type));
assert(!f16);
unsigned components = glsl_get_components(var->type);
switch (var->data.interpolation) {
case INTERP_MODE_SMOOTH:
if (f16)
data->vs.f16_smooth += components;
else
data->vs.f32_smooth += components;
break;
case INTERP_MODE_FLAT:
if (f16)
data->vs.f16_flat += components;
else
data->vs.f32_flat += components;
break;
case INTERP_MODE_NOPERSPECTIVE:
if (f16)
data->vs.f16_npc += components;
else
data->vs.f32_npc += components;
break;
default:
unreachable("");
}
}
for (unsigned f16 = 0; f16 <= 1; ++f16) {
for (enum glsl_interp_mode interp_mode = INTERP_MODE_SMOOTH;
interp_mode <= INTERP_MODE_NOPERSPECTIVE;
++interp_mode) {
try_allocate_vars(data->vs.varyings,
&data->vs.vtxouts,
nir,
&vars_mask,
nir_var_shader_out,
f16,
interp_mode,
1);
}
}
assert(!vars_mask);
const gl_varying_slot last_slots[] = {
VARYING_SLOT_PSIZ,
VARYING_SLOT_VIEWPORT,
VARYING_SLOT_LAYER,
};
for (unsigned u = 0; u < ARRAY_SIZE(last_slots); ++u) {
try_allocate_var(data->vs.varyings,
&data->vs.vtxouts,
nir,
nir->info.outputs_written,
nir_var_shader_out,
last_slots[u],
1);
}
}
static void pvr_alloc_fs_sysvals(pco_data *data, nir_shader *nir)
{
/* TODO */
}
static void pvr_alloc_fs_varyings(pco_data *data, nir_shader *nir)
{
assert(!data->common.coeffs);
/* Save the z/w locations. */
unsigned zw_count = !!data->fs.uses.z + !!data->fs.uses.w;
allocate_val(data->fs.varyings,
&data->common.coeffs,
VARYING_SLOT_POS,
zw_count * ROGUE_USC_COEFFICIENT_SET_SIZE);
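/* Each iterated value takes a whole coefficient set of
 * ROGUE_USC_COEFFICIENT_SET_SIZE dwords, so with both z and w in use this
 * reserves 2 * ROGUE_USC_COEFFICIENT_SET_SIZE dwords starting at coeff 0.
 */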
/* If point coords are used, they come after z/w (if present). */
nir_variable *var = nir_find_variable_with_location(nir,
nir_var_shader_in,
VARYING_SLOT_PNTC);
if (var) {
assert(!var->data.location_frac);
unsigned count = glsl_get_components(var->type);
assert(count == 2);
allocate_var(data->fs.varyings,
&data->common.coeffs,
var,
ROGUE_USC_COEFFICIENT_SET_SIZE);
data->fs.uses.pntc = true;
}
/* Allocate the rest of the input varyings. */
nir_foreach_shader_in_variable (var, nir) {
/* Already handled. */
if (var->data.location == VARYING_SLOT_POS ||
var->data.location == VARYING_SLOT_PNTC)
continue;
allocate_var(data->fs.varyings,
&data->common.coeffs,
var,
ROGUE_USC_COEFFICIENT_SET_SIZE);
}
}
static void
pvr_init_fs_outputs(pco_data *data,
const struct pvr_render_pass *pass,
const struct pvr_render_subpass *const subpass,
const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
{
for (unsigned u = 0; u < subpass->color_count; ++u) {
unsigned idx = subpass->color_attachments[u];
if (idx == VK_ATTACHMENT_UNUSED)
continue;
gl_frag_result location = FRAG_RESULT_DATA0 + u;
VkFormat vk_format = pass->attachments[idx].vk_format;
data->fs.output_formats[location] = vk_format_to_pipe_format(vk_format);
}
/* TODO: z-replicate. */
}
static void
pvr_setup_fs_outputs(pco_data *data,
nir_shader *nir,
const struct pvr_render_subpass *const subpass,
const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
{
ASSERTED unsigned num_outputs = hw_subpass->setup.num_render_targets;
assert(num_outputs == subpass->color_count);
uint64_t outputs_written = nir->info.outputs_written;
assert(util_bitcount64(outputs_written) == num_outputs);
for (unsigned u = 0; u < subpass->color_count; ++u) {
gl_frag_result location = FRAG_RESULT_DATA0 + u;
unsigned idx = subpass->color_attachments[u];
const struct usc_mrt_resource *mrt_resource;
ASSERTED bool output_reg;
enum pipe_format format;
unsigned format_bits;
nir_variable *var;
if (idx == VK_ATTACHMENT_UNUSED)
continue;
assert(u == idx); /* TODO: not sure if this is true or not... */
mrt_resource = &hw_subpass->setup.mrt_resources[u];
output_reg = mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
assert(output_reg);
/* TODO: tile buffer support. */
var = nir_find_variable_with_location(nir, nir_var_shader_out, location);
assert(var);
format = data->fs.output_formats[location];
format_bits = util_format_get_blocksizebits(format);
/* TODO: other sized formats. */
assert(!(format_bits % 32));
assert(mrt_resource->intermediate_size == format_bits / 8);
set_var(data->fs.outputs,
mrt_resource->reg.output_reg,
var,
format_bits / 32);
data->fs.output_reg[location] = output_reg;
outputs_written &= ~BITFIELD64_BIT(location);
}
/* TODO: z-replicate. */
assert(!outputs_written);
}
static void pvr_init_fs_input_attachments(
pco_data *data,
const struct pvr_render_subpass *const subpass,
const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
{
pvr_finishme("pvr_init_fs_input_attachments");
}
static void pvr_setup_fs_input_attachments(
pco_data *data,
nir_shader *nir,
const struct pvr_render_subpass *const subpass,
const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
{
pvr_finishme("pvr_setup_fs_input_attachments");
}
static void pvr_init_descriptors(pco_data *data,
nir_shader *nir,
struct vk_pipeline_layout *layout)
{
for (unsigned desc_set = 0; desc_set < layout->set_count; ++desc_set) {
const struct pvr_descriptor_set_layout *set_layout =
vk_to_pvr_descriptor_set_layout(layout->set_layouts[desc_set]);
pco_descriptor_set_data *desc_set_data =
&data->common.desc_sets[desc_set];
/* If the descriptor set isn't for this stage, skip it. */
if (!(BITFIELD_BIT(nir->info.stage) & set_layout->stage_flags))
continue;
desc_set_data->binding_count = set_layout->binding_count;
desc_set_data->bindings =
rzalloc_array_size(NULL,
sizeof(*desc_set_data->bindings),
set_layout->binding_count);
}
}
static void pvr_setup_descriptors(pco_data *data,
nir_shader *nir,
struct vk_pipeline_layout *layout)
{
gl_shader_stage stage = nir->info.stage;
for (unsigned desc_set = 0; desc_set < layout->set_count; ++desc_set) {
const struct pvr_descriptor_set_layout *set_layout =
vk_to_pvr_descriptor_set_layout(layout->set_layouts[desc_set]);
const unsigned desc_set_size_dw = set_layout->size / sizeof(uint32_t);
pco_descriptor_set_data *desc_set_data =
&data->common.desc_sets[desc_set];
pco_range *desc_set_range = &desc_set_data->range;
assert(!(set_layout->size % sizeof(uint32_t)));
/* If the descriptor set isn't for this stage or is unused, skip it. */
if (!(BITFIELD_BIT(stage) & set_layout->stage_flags)) {
assert(!desc_set_data->used);
continue;
}
if (!desc_set_data->used)
continue;
desc_set_range->start = data->common.shareds;
desc_set_range->count = desc_set_size_dw;
data->common.shareds += desc_set_size_dw;
for (unsigned binding = 0; binding < set_layout->binding_count;
++binding) {
const struct pvr_descriptor_set_layout_binding *layout_binding =
&set_layout->bindings[binding];
pco_binding_data *binding_data = &desc_set_data->bindings[binding];
binding_data->range = (pco_range){
.start = desc_set_range->start +
(layout_binding->offset / sizeof(uint32_t)),
.count =
(layout_binding->stride * layout_binding->descriptor_count) /
sizeof(uint32_t),
.stride = layout_binding->stride / sizeof(uint32_t),
};
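/* Worked example (hypothetical values): a binding with offset 16 bytes,
 * stride 8 bytes and descriptor_count 2, in a set whose shareds start at
 * dword 10, maps to { .start = 10 + 16/4 = 14, .count = 8*2/4 = 4,
 * .stride = 8/4 = 2 }.
 */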
}
}
assert(data->common.shareds < 256);
}
static void
pvr_preprocess_shader_data(pco_data *data,
nir_shader *nir,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
struct vk_pipeline_layout *layout)
{
switch (nir->info.stage) {
case MESA_SHADER_VERTEX: {
const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
pCreateInfo->pVertexInputState;
pvr_init_vs_attribs(data, vertex_input_state);
break;
}
case MESA_SHADER_FRAGMENT: {
PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
const struct pvr_render_subpass *const subpass =
&pass->subpasses[pCreateInfo->subpass];
const struct pvr_renderpass_hw_map *subpass_map =
&pass->hw_setup->subpass_map[pCreateInfo->subpass];
const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
&pass->hw_setup->renders[subpass_map->render]
.subpasses[subpass_map->subpass];
pvr_init_fs_outputs(data, pass, subpass, hw_subpass);
pvr_init_fs_input_attachments(data, subpass, hw_subpass);
/* TODO: push consts, blend consts, dynamic state, etc. */
break;
}
default:
unreachable("");
}
pvr_init_descriptors(data, nir, layout);
/* TODO: common things, like large constants being put into shareds. */
}
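/* Stage-specific shader data setup that runs after the NIR has been lowered
 * and post-processed: system value, attribute/varying and fragment output
 * allocation, plus the shared-register layout of the descriptor sets.
 */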
static void
pvr_postprocess_shader_data(pco_data *data,
nir_shader *nir,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
struct vk_pipeline_layout *layout)
{
switch (nir->info.stage) {
case MESA_SHADER_VERTEX: {
pvr_alloc_vs_sysvals(data, nir);
pvr_alloc_vs_attribs(data, nir);
pvr_alloc_vs_varyings(data, nir);
break;
}
case MESA_SHADER_FRAGMENT: {
PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
const struct pvr_render_subpass *const subpass =
&pass->subpasses[pCreateInfo->subpass];
const struct pvr_renderpass_hw_map *subpass_map =
&pass->hw_setup->subpass_map[pCreateInfo->subpass];
const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
&pass->hw_setup->renders[subpass_map->render]
.subpasses[subpass_map->subpass];
pvr_alloc_fs_sysvals(data, nir);
pvr_alloc_fs_varyings(data, nir);
pvr_setup_fs_outputs(data, nir, subpass, hw_subpass);
pvr_setup_fs_input_attachments(data, nir, subpass, hw_subpass);
/* TODO: push consts, blend consts, dynamic state, etc. */
break;
}
default:
unreachable("");
}
pvr_setup_descriptors(data, nir, layout);
/* TODO: common things, like large constants being put into shareds. */
}
/* Compiles and uploads shaders and PDS programs. */
static VkResult
pvr_graphics_pipeline_compile(struct pvr_device *const device,
struct vk_pipeline_cache *cache,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *const allocator,
struct pvr_graphics_pipeline *const gfx_pipeline)
{
struct vk_pipeline_layout *layout = gfx_pipeline->base.layout;
const uint32_t cache_line_size =
rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
VkResult result;
struct pvr_vertex_shader_state *vertex_state =
&gfx_pipeline->shader_state.vertex;
struct pvr_fragment_shader_state *fragment_state =
&gfx_pipeline->shader_state.fragment;
pco_ctx *pco_ctx = device->pdevice->pco_ctx;
nir_shader *producer = NULL;
nir_shader *consumer = NULL;
pco_data shader_data[MESA_SHADER_STAGES] = { 0 };
nir_shader *nir_shaders[MESA_SHADER_STAGES] = { 0 };
pco_shader *pco_shaders[MESA_SHADER_STAGES] = { 0 };
pco_shader **vs = &pco_shaders[MESA_SHADER_VERTEX];
pco_shader **fs = &pco_shaders[MESA_SHADER_FRAGMENT];
void *shader_mem_ctx = ralloc_context(NULL);
struct pvr_pds_vertex_dma vtx_dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
uint32_t vtx_dma_count = 0;
struct pvr_pds_coeff_loading_program frag_coeff_program = { 0 };
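/* Translate each active stage from SPIR-V to NIR and run the PCO NIR
 * preprocessing passes.
 */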
for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
size_t stage_index = gfx_pipeline->stage_indices[stage];
/* Skip unused/inactive stages. */
if (stage_index == ~0)
continue;
result =
vk_pipeline_shader_stage_to_nir(&device->vk,
gfx_pipeline->base.pipeline_flags,
&pCreateInfo->pStages[stage_index],
pco_spirv_options(),
pco_nir_options(),
shader_mem_ctx,
&nir_shaders[stage]);
if (result != VK_SUCCESS)
goto err_free_build_context;
pco_preprocess_nir(pco_ctx, nir_shaders[stage]);
}
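/* Link each stage with its producer, walking the stages in pipeline order. */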
for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
if (!nir_shaders[stage])
continue;
if (producer)
pco_link_nir(pco_ctx, producer, nir_shaders[stage]);
producer = nir_shaders[stage];
}
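/* Reverse pass: link each stage with its consumer. */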
for (gl_shader_stage stage = MESA_SHADER_STAGES; stage-- > 0;) {
if (!nir_shaders[stage])
continue;
if (consumer)
pco_rev_link_nir(pco_ctx, nir_shaders[stage], consumer);
consumer = nir_shaders[stage];
}
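/* Gather the pipeline-state-dependent shader data, lower and post-process
 * the NIR against it, then fill in the allocations the backend needs.
 */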
for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
if (!nir_shaders[stage])
continue;
pvr_preprocess_shader_data(&shader_data[stage],
nir_shaders[stage],
pCreateInfo,
layout);
pco_lower_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]);
pco_postprocess_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]);
pvr_postprocess_shader_data(&shader_data[stage],
nir_shaders[stage],
pCreateInfo,
layout);
}
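/* Translate the NIR into PCO IR, then run the backend passes and encode the
 * final binaries.
 */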
for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
pco_shader **pco = &pco_shaders[stage];
/* Skip unused/inactive stages. */
if (!nir_shaders[stage])
continue;
*pco = pco_trans_nir(pco_ctx,
nir_shaders[stage],
&shader_data[stage],
shader_mem_ctx);
if (!*pco) {
result = VK_ERROR_INITIALIZATION_FAILED;
goto err_free_build_context;
}
pco_process_ir(pco_ctx, *pco);
pco_encode_ir(pco_ctx, *pco);
}
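/* Save the vertex shader state, build the vertex attribute DMA descriptions
 * and upload the vertex shader binary.
 */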
pvr_vertex_state_save(gfx_pipeline, *vs);
pvr_graphics_pipeline_setup_vertex_dma(gfx_pipeline,
pCreateInfo->pVertexInputState,
vtx_dma_descriptions,
&vtx_dma_count);
result = pvr_gpu_upload_usc(device,
pco_shader_binary_data(*vs),
pco_shader_binary_size(*vs),
cache_line_size,
&vertex_state->bo);
if (result != VK_SUCCESS)
goto err_free_build_context;
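/* The fragment stage is optional; when present, upload the shader binary and
 * create the PDS coefficient, fragment and descriptor programs for it.
 */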
if (*fs) {
pvr_fragment_state_save(gfx_pipeline, *fs);
pvr_graphics_pipeline_setup_fragment_coeff_program(
gfx_pipeline,
nir_shaders[MESA_SHADER_FRAGMENT],
&frag_coeff_program);
result = pvr_gpu_upload_usc(device,
pco_shader_binary_data(*fs),
pco_shader_binary_size(*fs),
cache_line_size,
&fragment_state->bo);
if (result != VK_SUCCESS)
goto err_free_vertex_bo;
result = pvr_pds_coeff_program_create_and_upload(device,
allocator,
&frag_coeff_program,
fragment_state);
if (result != VK_SUCCESS)
goto err_free_fragment_bo;
result = pvr_pds_fragment_program_create_and_upload(device,
allocator,
*fs,
fragment_state);
if (result != VK_SUCCESS)
goto err_free_coeff_program;
result = pvr_pds_descriptor_program_create_and_upload(
device,
allocator,
layout,
MESA_SHADER_FRAGMENT,
&gfx_pipeline->fs_data,
&fragment_state->descriptor_state);
if (result != VK_SUCCESS)
goto err_free_frag_program;
/* The fragment descriptor program is expected to require no temps; if that
 * ever changes we need to MAX2() it against the existing value and set
 * `fragment_state->stage_state.pds_temps_count` appropriately.
 */
assert(fragment_state->descriptor_state.pds_info.temps_required == 0);
}
result = pvr_pds_vertex_attrib_programs_create_and_upload(
device,
allocator,
&gfx_pipeline->vs_data,
vtx_dma_descriptions,
vtx_dma_count,
&vertex_state->pds_attrib_programs);
if (result != VK_SUCCESS)
goto err_free_frag_descriptor_program;
result = pvr_pds_descriptor_program_create_and_upload(
device,
allocator,
layout,
MESA_SHADER_VERTEX,
&gfx_pipeline->vs_data,
&vertex_state->descriptor_state);
if (result != VK_SUCCESS)
goto err_free_vertex_attrib_program;
/* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
 * scratch buffer for both the vertex and fragment stages.
 * Figure out the best place to do this.
 */
/* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
/* TODO: Implement spilling with the above. */
ralloc_free(shader_mem_ctx);
return VK_SUCCESS;
err_free_vertex_attrib_program:
for (uint32_t i = 0; i < ARRAY_SIZE(vertex_state->pds_attrib_programs);
i++) {
struct pvr_pds_attrib_program *const attrib_program =
&vertex_state->pds_attrib_programs[i];
pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
}
err_free_frag_descriptor_program:
pvr_pds_descriptor_program_destroy(device,
allocator,
&fragment_state->descriptor_state);
err_free_frag_program:
pvr_bo_suballoc_free(fragment_state->pds_fragment_program.pvr_bo);
err_free_coeff_program:
pvr_bo_suballoc_free(fragment_state->pds_coeff_program.pvr_bo);
err_free_fragment_bo:
pvr_bo_suballoc_free(fragment_state->bo);
err_free_vertex_bo:
pvr_bo_suballoc_free(vertex_state->bo);
err_free_build_context:
ralloc_free(shader_mem_ctx);
return result;
}
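/* Build the vk_render_pass_state needed by vk_graphics_pipeline_state_fill()
 * from the subpass' attachment usage.
 */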
static struct vk_render_pass_state
pvr_create_renderpass_state(const VkGraphicsPipelineCreateInfo *const info)
{
PVR_FROM_HANDLE(pvr_render_pass, pass, info->renderPass);
const struct pvr_render_subpass *const subpass =
&pass->subpasses[info->subpass];
enum vk_rp_attachment_flags attachments = 0;
assert(info->subpass < pass->subpass_count);
for (uint32_t i = 0; i < subpass->color_count; i++) {
if (pass->attachments[subpass->color_attachments[i]].aspects)
attachments |= MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << i;
}
if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
VkImageAspectFlags ds_aspects =
pass->attachments[subpass->depth_stencil_attachment].aspects;
if (ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
if (ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
}
return (struct vk_render_pass_state){
.attachments = attachments,
/* TODO: This is only needed for VK_KHR_create_renderpass2 (or core 1.2),
* which is not currently supported.
*/
.view_mask = 0,
};
}
static VkResult
pvr_graphics_pipeline_init(struct pvr_device *device,
struct vk_pipeline_cache *cache,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *allocator,
struct pvr_graphics_pipeline *gfx_pipeline)
{
struct vk_dynamic_graphics_state *const dynamic_state =
&gfx_pipeline->dynamic_state;
const struct vk_render_pass_state rp_state =
pvr_create_renderpass_state(pCreateInfo);
struct vk_graphics_pipeline_all_state all_state;
struct vk_graphics_pipeline_state state = { 0 };
VkResult result;
pvr_pipeline_init(device,
PVR_PIPELINE_TYPE_GRAPHICS,
pCreateInfo->layout,
&gfx_pipeline->base);
result = vk_graphics_pipeline_state_fill(&device->vk,
&state,
pCreateInfo,
&rp_state,
0,
&all_state,
NULL,
0,
NULL);
if (result != VK_SUCCESS)
goto err_pipeline_finish;
vk_dynamic_graphics_state_init(dynamic_state);
/* Load static state into base dynamic state holder. */
vk_dynamic_graphics_state_fill(dynamic_state, &state);
/* The value of ms.rasterization_samples is undefined when
* rasterizer_discard_enable is set, but we need a specific value.
* Fill that in here.
*/
if (state.rs->rasterizer_discard_enable)
dynamic_state->ms.rasterization_samples = VK_SAMPLE_COUNT_1_BIT;
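/* Mark every stage as unused before recording the stages provided in
 * pCreateInfo.
 */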
memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));
for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
/* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
*
* "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
* or VK_SHADER_STAGE_ALL."
*
* So we don't handle that.
*
* We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
* VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
* 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
* structure returned by the driver.
*/
switch (pCreateInfo->pStages[i].stage) {
case VK_SHADER_STAGE_VERTEX_BIT:
case VK_SHADER_STAGE_FRAGMENT_BIT:
gfx_pipeline->stage_indices[gl_stage] = i;
break;
default:
unreachable("Unsupported stage.");
}
}
/* Compiles and uploads shaders and PDS programs. */
result = pvr_graphics_pipeline_compile(device,
cache,
pCreateInfo,
allocator,
gfx_pipeline);
if (result != VK_SUCCESS)
goto err_pipeline_finish;
return VK_SUCCESS;
err_pipeline_finish:
pvr_pipeline_finish(device, &gfx_pipeline->base);
return result;
}
/* If allocator == NULL, the internal one will be used. */
static VkResult
pvr_graphics_pipeline_create(struct pvr_device *device,
struct vk_pipeline_cache *cache,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *allocator,
VkPipeline *const pipeline_out)
{
struct pvr_graphics_pipeline *gfx_pipeline;
VkResult result;
gfx_pipeline = vk_zalloc2(&device->vk.alloc,
allocator,
sizeof(*gfx_pipeline),
8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!gfx_pipeline)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* Compiles and uploads shaders and PDS programs too. */
result = pvr_graphics_pipeline_init(device,
cache,
pCreateInfo,
allocator,
gfx_pipeline);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
return result;
}
*pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);
return VK_SUCCESS;
}
VkResult
pvr_CreateGraphicsPipelines(VkDevice _device,
VkPipelineCache pipelineCache,
uint32_t createInfoCount,
const VkGraphicsPipelineCreateInfo *pCreateInfos,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipelines)
{
VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
PVR_FROM_HANDLE(pvr_device, device, _device);
VkResult result = VK_SUCCESS;
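/* Create each pipeline independently; on failure, record the error, set the
 * corresponding entry to VK_NULL_HANDLE and carry on with the remaining
 * create infos.
 */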
for (uint32_t i = 0; i < createInfoCount; i++) {
const VkResult local_result =
pvr_graphics_pipeline_create(device,
cache,
&pCreateInfos[i],
pAllocator,
&pPipelines[i]);
if (local_result != VK_SUCCESS) {
result = local_result;
pPipelines[i] = VK_NULL_HANDLE;
}
}
return result;
}
/*****************************************************************************
Other functions
*****************************************************************************/
void pvr_DestroyPipeline(VkDevice _device,
VkPipeline _pipeline,
const VkAllocationCallbacks *pAllocator)
{
PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
PVR_FROM_HANDLE(pvr_device, device, _device);
if (!pipeline)
return;
switch (pipeline->type) {
case PVR_PIPELINE_TYPE_GRAPHICS: {
struct pvr_graphics_pipeline *const gfx_pipeline =
to_pvr_graphics_pipeline(pipeline);
pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
break;
}
case PVR_PIPELINE_TYPE_COMPUTE: {
struct pvr_compute_pipeline *const compute_pipeline =
to_pvr_compute_pipeline(pipeline);
pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
break;
}
default:
unreachable("Unknown pipeline type.");
}
}