| /* |
| * Copyright © 2022 Collabora Ltd. and Red Hat Inc. |
| * SPDX-License-Identifier: MIT |
| */ |
| #include "nvk_shader.h" |
| |
| #include "nvk_cmd_buffer.h" |
| #include "nvk_descriptor_set_layout.h" |
| #include "nvk_device.h" |
| #include "nvk_mme.h" |
| #include "nvk_physical_device.h" |
| #include "nvk_sampler.h" |
| |
| #include "vk_nir_convert_ycbcr.h" |
| #include "vk_pipeline.h" |
| #include "vk_pipeline_layout.h" |
| #include "vk_shader_module.h" |
| #include "vk_ycbcr_conversion.h" |
| |
| #include "nak.h" |
| #include "nir.h" |
| #include "nir_builder.h" |
| #include "compiler/spirv/nir_spirv.h" |
| |
| #include "util/mesa-sha1.h" |
| #include "util/u_debug.h" |
| |
| #include "cla097.h" |
| #include "clb097.h" |
| #include "clc597.h" |
| #include "nv_push_cl9097.h" |
| #include "nv_push_clb197.h" |
| #include "nv_push_clc397.h" |
| #include "nv_push_clc797.h" |
| |
| static void |
| shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) |
| { |
| assert(glsl_type_is_vector_or_scalar(type)); |
| |
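   /* For example, a vec3 of 32-bit floats yields *size = 12 and
    * *align = 4; booleans are counted as 32-bit.
    */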
   uint32_t comp_size =
      glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length;
   *align = comp_size;
| } |
| |
| uint64_t |
| nvk_physical_device_compiler_flags(const struct nvk_physical_device *pdev) |
| { |
| bool no_cbufs = pdev->debug_flags & NVK_DEBUG_NO_CBUF; |
| bool use_edb_buffer_views = nvk_use_edb_buffer_views(pdev); |
| uint64_t nak_flags = nak_debug_flags(pdev->nak); |
| |
| assert(nak_flags <= UINT16_MAX); |
| |
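   /* Layout of the returned key:
    *
    *    bit  12     : NVK_DEBUG_NO_CBUF
    *    bit  13     : EDB buffer views
    *    bits 48..63 : NAK debug flags (guaranteed to fit by the assert
    *                  above)
    */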
| return ((uint64_t)no_cbufs << 12) |
| | ((uint64_t)use_edb_buffer_views << 13) |
| | (nak_flags << 48); |
| } |
| |
| static const nir_shader_compiler_options * |
| nvk_get_nir_options(struct vk_physical_device *vk_pdev, |
| mesa_shader_stage stage, |
| UNUSED const struct vk_pipeline_robustness_state *rs) |
| { |
| const struct nvk_physical_device *pdev = |
| container_of(vk_pdev, struct nvk_physical_device, vk); |
| return nak_nir_options(pdev->nak); |
| } |
| |
| nir_address_format |
| nvk_ubo_addr_format(const struct nvk_physical_device *pdev, |
| const struct vk_pipeline_robustness_state *rs) |
| { |
| if (nvk_use_bindless_cbuf(&pdev->info)) { |
| return nir_address_format_vec2_index_32bit_offset; |
| } else if (rs->null_uniform_buffer_descriptor) { |
| /* We need bounds checking for null descriptors */ |
| return nir_address_format_64bit_bounded_global; |
| } else { |
| switch (rs->uniform_buffers) { |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT: |
| return nir_address_format_64bit_global_32bit_offset; |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT: |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT: |
| return nir_address_format_64bit_bounded_global; |
| default: |
| UNREACHABLE("Invalid robust buffer access behavior"); |
| } |
| } |
| } |
| |
| nir_address_format |
| nvk_ssbo_addr_format(const struct nvk_physical_device *pdev, |
| const struct vk_pipeline_robustness_state *rs) |
| { |
| if (rs->null_storage_buffer_descriptor) { |
| /* We need bounds checking for null descriptors */ |
| return nir_address_format_64bit_bounded_global; |
| } else { |
| switch (rs->storage_buffers) { |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT: |
| return nir_address_format_64bit_global_32bit_offset; |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT: |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT: |
| return nir_address_format_64bit_bounded_global; |
| default: |
| UNREACHABLE("Invalid robust buffer access behavior"); |
| } |
| } |
| } |
| |
| static struct spirv_to_nir_options |
| nvk_get_spirv_options(struct vk_physical_device *vk_pdev, |
| UNUSED mesa_shader_stage stage, |
| const struct vk_pipeline_robustness_state *rs) |
| { |
| const struct nvk_physical_device *pdev = |
| container_of(vk_pdev, struct nvk_physical_device, vk); |
| |
| return (struct spirv_to_nir_options) { |
| .ssbo_addr_format = nvk_ssbo_addr_format(pdev, rs), |
| .phys_ssbo_addr_format = nir_address_format_64bit_global, |
| .ubo_addr_format = nvk_ubo_addr_format(pdev, rs), |
| .shared_addr_format = nir_address_format_32bit_offset, |
| .min_ssbo_alignment = NVK_MIN_SSBO_ALIGNMENT, |
| .min_ubo_alignment = nvk_min_cbuf_alignment(&pdev->info), |
| }; |
| } |
| |
| static void |
| nvk_preprocess_nir(struct vk_physical_device *vk_pdev, |
| nir_shader *nir, |
| UNUSED const struct vk_pipeline_robustness_state *rs) |
| { |
| const struct nvk_physical_device *pdev = |
| container_of(vk_pdev, struct nvk_physical_device, vk); |
| |
| nak_preprocess_nir(nir, pdev->nak); |
| |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) { |
| nir_input_attachment_options ia_opts = { |
| .use_ia_coord_intrin = true, |
| }; |
| NIR_PASS(_, nir, nir_lower_input_attachments, &ia_opts); |
| } |
| } |
| |
| static void |
| nvk_populate_fs_key(struct nak_fs_key *key, |
| const struct vk_graphics_pipeline_state *state) |
| { |
| memset(key, 0, sizeof(*key)); |
| |
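   /* The sample locations and masks live in the root descriptor table,
    * which NVK binds as cbuf 0, hence sample_info_cb = 0.
    */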
| key->sample_info_cb = 0; |
   key->sample_locations_offset =
      nvk_root_descriptor_offset(draw.sample_locations);
   key->sample_masks_offset = nvk_root_descriptor_offset(draw.sample_masks);
| |
   /* Turn underestimate on when no state is available or when it's
    * explicitly requested.
    */
| if (state == NULL || state->rs == NULL || |
| state->rs->conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT) |
| key->uses_underestimate = true; |
| |
| if (state == NULL) |
| return; |
| |
| if (state->pipeline_flags & |
| VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) |
| key->zs_self_dep = true; |
| |
| /* We force per-sample interpolation whenever sampleShadingEnable is set |
| * regardless of minSampleShading or rasterizationSamples. |
| * |
| * When sampleShadingEnable is set, few guarantees are made about the |
| * location of interpolation of the inputs. The only real guarantees are |
| * that the inputs are interpolated within the pixel and that you get at |
| * least `rasterizationSamples * minSampleShading` unique positions. |
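    * For example, with rasterizationSamples = 4 and minSampleShading = 0.5
    * that's at least 2 unique positions.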
| * Importantly, it does not require that when `rasterizationSamples * |
| * minSampleShading <= 1.0` that those positions are at the fragment |
| * center. Therefore, it's valid to just always do per-sample (which maps |
| * to CENTROID on NVIDIA hardware) all the time and let the hardware sort |
| * it out based on what we set in HYBRID_ANTI_ALIAS_CONTROL::passes. |
| * |
| * Also, we set HYBRID_ANTI_ALIAS_CONTROL::centroid at draw time based on |
| * `rasterizationSamples * minSampleShading` so it should be per-pixel |
| * whenever we're running only a single pass. However, this would still be |
| * correct even if it got interpolated at some other sample. |
| * |
| * The one caveat here is that we have to be careful about gl_SampleMaskIn. |
| * When `nak_fs_key::force_sample_shading = true` we also turn any reads of |
| * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask |
| * is actually per-fragment, not per-pass. We handle this by smashing |
| * minSampleShading to 1.0 whenever gl_SampleMaskIn is read. |
| */ |
| const struct vk_multisample_state *ms = state->ms; |
| if (ms != NULL && ms->sample_shading_enable) |
| key->force_sample_shading = true; |
| } |
| |
| static void |
| nvk_hash_state(struct vk_physical_device *device, |
| const struct vk_graphics_pipeline_state *state, |
| const struct vk_features *enabled_features, |
| VkShaderStageFlags stages, |
| blake3_hash blake3_out) |
| { |
| struct mesa_blake3 blake3_ctx; |
| _mesa_blake3_init(&blake3_ctx); |
| if (state && (stages & VK_SHADER_STAGE_FRAGMENT_BIT)) { |
| struct nak_fs_key key; |
| nvk_populate_fs_key(&key, state); |
| _mesa_blake3_update(&blake3_ctx, &key, sizeof(key)); |
| |
| /* This doesn't impact the shader compile but it does go in the |
| * nvk_shader and gets [de]serialized along with the binary so we |
| * need to hash it. |
| */ |
| if (state->ms && state->ms->sample_shading_enable) { |
| _mesa_blake3_update(&blake3_ctx, &state->ms->min_sample_shading, |
| sizeof(state->ms->min_sample_shading)); |
| } |
| } |
| _mesa_blake3_final(&blake3_ctx, blake3_out); |
| } |
| |
| static bool |
| lower_load_intrinsic(nir_builder *b, nir_intrinsic_instr *load, |
| UNUSED void *_data) |
| { |
| switch (load->intrinsic) { |
| case nir_intrinsic_load_ubo: { |
| b->cursor = nir_before_instr(&load->instr); |
| |
| nir_def *index = load->src[0].ssa; |
| nir_def *offset = load->src[1].ssa; |
| const enum gl_access_qualifier access = nir_intrinsic_access(load); |
| const uint32_t align_mul = nir_intrinsic_align_mul(load); |
| const uint32_t align_offset = nir_intrinsic_align_offset(load); |
| |
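      /* A one-component index selects a bound cbuf slot (ldc); a
       * two-component index is packed into a 64-bit bindless cbuf handle
       * (ldcx).
       */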
| nir_def *val; |
      if (index->num_components == 1) {
         val = nir_ldc_nv(b, load->num_components, load->def.bit_size,
                          index, offset, .access = access,
                          .align_mul = align_mul,
                          .align_offset = align_offset);
      } else if (index->num_components == 2) {
         nir_def *handle = nir_pack_64_2x32(b, index);
| val = nir_ldcx_nv(b, load->num_components, load->def.bit_size, |
| handle, offset, .access = access, |
| .align_mul = align_mul, |
| .align_offset = align_offset); |
| } else { |
| UNREACHABLE("Invalid UBO index"); |
| } |
| nir_def_rewrite_uses(&load->def, val); |
| return true; |
| } |
| |
| case nir_intrinsic_load_global_constant_offset: |
| case nir_intrinsic_load_global_constant_bounded: { |
| b->cursor = nir_before_instr(&load->instr); |
| |
| nir_def *base_addr = load->src[0].ssa; |
| nir_def *offset = load->src[1].ssa; |
| |
| nir_def *zero = NULL; |
| if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) { |
| nir_def *bound = load->src[2].ssa; |
| |
| unsigned bit_size = load->def.bit_size; |
| assert(bit_size >= 8 && bit_size % 8 == 0); |
| unsigned byte_size = bit_size / 8; |
| |
| zero = nir_imm_zero(b, load->num_components, bit_size); |
| |
| unsigned load_size = byte_size * load->num_components; |
| |
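         /* Clamp the offset so that sat_offset + load_size - 1 can't wrap
          * in 32 bits; out-of-bounds loads then fail the bounds check and
          * the phi below yields zero instead.
          */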
| nir_def *sat_offset = |
| nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1))); |
| nir_def *in_bounds = |
| nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound); |
| |
| nir_push_if(b, in_bounds); |
| } |
| |
| nir_def *val = |
| nir_build_load_global_constant(b, load->def.num_components, |
| load->def.bit_size, |
| nir_iadd(b, base_addr, nir_u2u64(b, offset)), |
| .align_mul = nir_intrinsic_align_mul(load), |
| .align_offset = nir_intrinsic_align_offset(load)); |
| |
| if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) { |
| nir_pop_if(b, NULL); |
| val = nir_if_phi(b, val, zero); |
| } |
| |
| nir_def_rewrite_uses(&load->def, val); |
| return true; |
| } |
| |
| default: |
| return false; |
| } |
| } |
| |
| struct lower_ycbcr_state { |
| uint32_t set_layout_count; |
| struct vk_descriptor_set_layout * const *set_layouts; |
| }; |
| |
| static const struct vk_ycbcr_conversion_state * |
| lookup_ycbcr_conversion(const void *_state, uint32_t set, |
| uint32_t binding, uint32_t array_index) |
| { |
| const struct lower_ycbcr_state *state = _state; |
| assert(set < state->set_layout_count); |
| assert(state->set_layouts[set] != NULL); |
| const struct nvk_descriptor_set_layout *set_layout = |
| vk_to_nvk_descriptor_set_layout(state->set_layouts[set]); |
| assert(binding < set_layout->binding_count); |
| |
| const struct nvk_descriptor_set_binding_layout *bind_layout = |
| &set_layout->binding[binding]; |
| |
| if (bind_layout->immutable_samplers == NULL) |
| return NULL; |
| |
| array_index = MIN2(array_index, bind_layout->array_size - 1); |
| |
| const struct nvk_sampler *sampler = |
| bind_layout->immutable_samplers[array_index]; |
| |
| return sampler && sampler->vk.ycbcr_conversion ? |
| &sampler->vk.ycbcr_conversion->state : NULL; |
| } |
| |
| static void |
| nvk_lower_nir(struct nvk_device *dev, nir_shader *nir, |
| VkShaderCreateFlagsEXT shader_flags, |
| const struct vk_pipeline_robustness_state *rs, |
| uint32_t set_layout_count, |
| struct vk_descriptor_set_layout * const *set_layouts, |
| struct nvk_cbuf_map *cbuf_map_out) |
| { |
| const struct nvk_physical_device *pdev = nvk_device_physical(dev); |
| |
| if (nir->info.stage == MESA_SHADER_TESS_EVAL) { |
| NIR_PASS(_, nir, nir_lower_patch_vertices, |
| nir->info.tess.tcs_vertices_out, NULL); |
| } |
| |
| const struct lower_ycbcr_state ycbcr_state = { |
| .set_layout_count = set_layout_count, |
| .set_layouts = set_layouts, |
| }; |
| NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, |
| lookup_ycbcr_conversion, &ycbcr_state); |
| |
| nir_lower_compute_system_values_options csv_options = { |
| .has_base_workgroup_id = true, |
| }; |
| NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options); |
| |
| /* Lower push constants before lower_descriptors */ |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const, |
| nir_address_format_32bit_offset); |
| |
| struct nvk_cbuf_map *cbuf_map = NULL; |
| if (!(pdev->debug_flags & NVK_DEBUG_NO_CBUF)) { |
| cbuf_map = cbuf_map_out; |
| |
| /* Large constant support assumes cbufs */ |
| NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32); |
| } else { |
| *cbuf_map_out = (struct nvk_cbuf_map) { |
| .cbuf_count = 1, |
| .cbufs = { |
| { .type = NVK_CBUF_TYPE_ROOT_DESC }, |
| } |
| }; |
| } |
| |
| nir_opt_access_options opt_access_options = { |
| .is_vulkan = true, |
| }; |
| NIR_PASS(_, nir, nir_opt_access, &opt_access_options); |
| |
| /* On Kepler, we have to lower images to addresses */ |
| if (pdev->info.cls_eng3d < MAXWELL_A) |
| NIR_PASS(_, nir, nak_nir_lower_image_addrs, pdev->nak); |
| |
| NIR_PASS(_, nir, nvk_nir_lower_descriptors, pdev, shader_flags, rs, |
| set_layout_count, set_layouts, cbuf_map); |
| |
| if (nvk_use_bindless_cbuf(&pdev->info)) { |
      /* On Turing+ where we have bindless cbufs, we use ACCESS_NON_UNIFORM
       * to determine whether or not it's safe to assume a uniform handle,
       * so we want to optimize the non-uniform qualifier away whenever
       * possible.
       */
| if (nir_has_non_uniform_access(nir, nir_lower_non_uniform_ubo_access)) |
| NIR_PASS(_, nir, nir_opt_non_uniform_access); |
| } |
| |
| if (pdev->info.cls_eng3d < TURING_A) { |
| /* NOTE: This does nothing for images on Kepler since those are lowered |
| * to suldga/sustga before we get here. That's fine, though, because |
| * our nil_su_info fetches and calculations work fine with non-uniform |
| * descriptors. |
| */ |
| struct nir_lower_non_uniform_access_options opts = { |
| .types = nir_lower_non_uniform_texture_access | |
| nir_lower_non_uniform_image_access, |
| .callback = NULL, |
| }; |
      /* In practice, most shaders do not have non-uniform-qualified
       * accesses, so we first run a cheaper check that usually comes back
       * negative and lets us skip both passes.
       */
| if (nir_has_non_uniform_access(nir, opts.types)) { |
| NIR_PASS(_, nir, nir_opt_non_uniform_access); |
| NIR_PASS(_, nir, nir_lower_non_uniform_access, &opts); |
| } |
| } |
| |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, |
| nir_address_format_64bit_global); |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo, |
| nvk_ssbo_addr_format(pdev, rs)); |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo, |
| nvk_ubo_addr_format(pdev, rs)); |
| NIR_PASS(_, nir, nir_shader_intrinsics_pass, |
| lower_load_intrinsic, nir_metadata_none, NULL); |
| |
| NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, |
| nir_var_mem_shared, shared_var_info); |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, |
| nir_address_format_32bit_offset); |
| |
| if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) { |
      /* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B, so it's safe
       * to align the size up to 16B and write whole vec4s.
       */
| nir->info.shared_size = align(nir->info.shared_size, 16); |
| NIR_PASS(_, nir, nir_zero_initialize_shared_memory, |
| nir->info.shared_size, 16); |
| |
| /* We need to call lower_compute_system_values again because |
| * nir_zero_initialize_shared_memory generates load_invocation_id which |
| * has to be lowered to load_invocation_index. |
| */ |
| NIR_PASS(_, nir, nir_lower_compute_system_values, NULL); |
| } |
| } |
| |
| #ifndef NDEBUG |
| static void |
| nvk_shader_dump(struct nvk_shader *shader) |
| { |
| unsigned pos; |
| |
| if (shader->info.stage != MESA_SHADER_COMPUTE) { |
| _debug_printf("dumping HDR for %s shader\n", |
| _mesa_shader_stage_to_string(shader->info.stage)); |
| for (pos = 0; pos < ARRAY_SIZE(shader->info.hdr); ++pos) |
         _debug_printf("HDR[%02zx] = 0x%08x\n",
                       pos * sizeof(shader->info.hdr[0]), shader->info.hdr[pos]);
| } |
| _debug_printf("shader binary code (0x%x bytes):", shader->code_size); |
| for (pos = 0; pos < shader->code_size / 4; ++pos) { |
| if ((pos % 8) == 0) |
| _debug_printf("\n"); |
| _debug_printf("%08x ", ((const uint32_t *)shader->code_ptr)[pos]); |
| } |
| _debug_printf("\n"); |
| } |
| #endif |
| |
| static VkResult |
| nvk_compile_nir(struct nvk_device *dev, nir_shader *nir, |
| VkShaderCreateFlagsEXT shader_flags, |
| const struct vk_pipeline_robustness_state *rs, |
| const struct nak_fs_key *fs_key, |
| struct nvk_shader *shader) |
| { |
| const struct nvk_physical_device *pdev = nvk_device_physical(dev); |
| |
| const bool dump_asm = |
| shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA; |
| |
| nir_variable_mode robust2_modes = 0; |
| if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) |
| robust2_modes |= nir_var_mem_ubo; |
| if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) |
| robust2_modes |= nir_var_mem_ssbo; |
| |
| shader->nak = nak_compile_shader(nir, dump_asm, pdev->nak, |
| robust2_modes, fs_key); |
| if (!shader->nak) |
| return vk_errorf(pdev, VK_ERROR_UNKNOWN, "Internal compiler error in NAK"); |
| |
| shader->info = shader->nak->info; |
| shader->code_ptr = shader->nak->code; |
| shader->code_size = shader->nak->code_size; |
| |
| if (nir->constant_data_size > 0) { |
| uint32_t data_align = nvk_min_cbuf_alignment(&pdev->info); |
| uint32_t data_size = align(nir->constant_data_size, data_align); |
| |
| void *data = malloc(data_size); |
| if (data == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| memcpy(data, nir->constant_data, nir->constant_data_size); |
| |
| assert(nir->constant_data_size <= data_size); |
| memset(data + nir->constant_data_size, 0, |
| data_size - nir->constant_data_size); |
| |
| shader->data_ptr = data; |
| shader->data_size = data_size; |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| nvk_shader_upload(struct nvk_device *dev, struct nvk_shader *shader) |
| { |
| const struct nvk_physical_device *pdev = nvk_device_physical(dev); |
| |
| uint32_t hdr_size = 0; |
| if (shader->info.stage != MESA_SHADER_COMPUTE) { |
| if (pdev->info.cls_eng3d >= TURING_A) |
| hdr_size = TU102_SHADER_HEADER_SIZE; |
| else |
| hdr_size = GF100_SHADER_HEADER_SIZE; |
| } |
| |
   /* Fermi needs 0x40 alignment.  Kepler+ needs the first instruction to be
    * 0x80 aligned, so we waste 0x30 bytes.
    */
| int alignment = pdev->info.cls_eng3d >= KEPLER_A ? 0x80 : 0x40; |
| |
| uint32_t total_size = 0; |
| if (pdev->info.cls_eng3d >= KEPLER_A && |
| pdev->info.cls_eng3d < TURING_A && |
| hdr_size > 0) { |
      /* It's the instructions that have to be aligned, so we need to start
       * at a small offset (0x30 bytes) into the upload area.
       */
| total_size = alignment - hdr_size; |
| } |
| |
| const uint32_t hdr_offset = total_size; |
| total_size += hdr_size; |
| |
| const uint32_t code_offset = total_size; |
| assert(code_offset % alignment == 0); |
| total_size += shader->code_size; |
| |
| uint32_t data_offset = 0; |
| if (shader->data_size > 0) { |
| uint32_t cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info); |
| alignment = MAX2(alignment, cbuf_alignment); |
| total_size = align(total_size, cbuf_alignment); |
| data_offset = total_size; |
| total_size += shader->data_size; |
| } |
| |
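   /* Layout of the upload:
    *
    *    [pad so code lands on alignment][header][code]
    *    [pad to cbuf alignment][constant data, if any]
    */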
| char *data = malloc(total_size); |
| if (data == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| assert(hdr_size <= sizeof(shader->info.hdr)); |
| memcpy(data + hdr_offset, shader->info.hdr, hdr_size); |
| memcpy(data + code_offset, shader->code_ptr, shader->code_size); |
| if (shader->data_size > 0) |
| memcpy(data + data_offset, shader->data_ptr, shader->data_size); |
| |
| #ifndef NDEBUG |
| if (debug_get_bool_option("NV50_PROG_DEBUG", false)) |
| nvk_shader_dump(shader); |
| #endif |
| |
| VkResult result = nvk_heap_upload(dev, &dev->shader_heap, data, |
| total_size, alignment, |
| &shader->upload_addr); |
| if (result == VK_SUCCESS) { |
| shader->upload_size = total_size; |
| |
| shader->hdr_addr = shader->upload_addr + hdr_offset; |
| if (pdev->info.cls_eng3d < VOLTA_A) { |
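         /* Pre-Volta, 3D shaders are addressed relative to the program
          * region rather than by VA, so we store a heap-relative offset.
          */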
| const uint64_t heap_base_addr = |
| nvk_heap_contiguous_base_address(&dev->shader_heap); |
| assert(shader->upload_addr - heap_base_addr < UINT32_MAX); |
| shader->hdr_addr -= heap_base_addr; |
| } |
| shader->data_addr = shader->upload_addr + data_offset; |
| } |
| free(data); |
| |
| return result; |
| } |
| |
| uint32_t |
| mesa_to_nv9097_shader_type(mesa_shader_stage stage) |
| { |
| static const uint32_t mesa_to_nv9097[] = { |
| [MESA_SHADER_VERTEX] = NV9097_SET_PIPELINE_SHADER_TYPE_VERTEX, |
| [MESA_SHADER_TESS_CTRL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION_INIT, |
| [MESA_SHADER_TESS_EVAL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION, |
| [MESA_SHADER_GEOMETRY] = NV9097_SET_PIPELINE_SHADER_TYPE_GEOMETRY, |
| [MESA_SHADER_FRAGMENT] = NV9097_SET_PIPELINE_SHADER_TYPE_PIXEL, |
| }; |
| assert(stage < ARRAY_SIZE(mesa_to_nv9097)); |
| return mesa_to_nv9097[stage]; |
| } |
| |
| uint32_t |
| nvk_pipeline_bind_group(mesa_shader_stage stage) |
| { |
| return stage; |
| } |
| |
| uint16_t |
| nvk_max_shader_push_dw(const struct nvk_physical_device *pdev, |
| mesa_shader_stage stage, bool last_vtgm) |
| { |
| if (stage == MESA_SHADER_COMPUTE) |
| return 0; |
| |
   /* SET_PIPELINE_SHADER, the program address, and the register count and
    * binding
    */
   uint16_t max_dw_count = 8;

   /* NVK_MME_SET_TESS_PARAMS */
   if (stage == MESA_SHADER_TESS_EVAL)
      max_dw_count += 2;

   /* Subtiling perf knobs, mandated early-Z, post-Z pixel imask, zcull
    * bounds, shading rate control, and anti-alias state
    */
   if (stage == MESA_SHADER_FRAGMENT)
      max_dw_count += 13;

   if (last_vtgm) {
      /* SET_RT_LAYER, VPRS table select, and user clip enables/ops */
      max_dw_count += 8;

      /* Per-XFB-buffer stream-out control (5 dwords) plus up to 128
       * attribute indices packed four to a dword
       */
      max_dw_count += 4 * (5 + (128 / 4));
   }
| |
| return max_dw_count; |
| } |
| |
| static VkResult |
| nvk_shader_fill_push(struct nvk_device *dev, |
| struct nvk_shader *shader, |
| const VkAllocationCallbacks* pAllocator) |
| { |
| const struct nvk_physical_device *pdev = nvk_device_physical(dev); |
| |
| ASSERTED uint16_t max_dw_count = 0; |
| uint32_t push_dw[200]; |
| struct nv_push push, *p = &push; |
| nv_push_init(&push, push_dw, ARRAY_SIZE(push_dw), |
| nvk_queue_subchannels_from_engines(NVKMD_ENGINE_3D)); |
| |
| const uint32_t type = mesa_to_nv9097_shader_type(shader->info.stage); |
| |
| /* We always map index == type */ |
| const uint32_t idx = type; |
| |
| max_dw_count += 2; |
| P_IMMD(p, NV9097, SET_PIPELINE_SHADER(idx), { |
| .enable = ENABLE_TRUE, |
| .type = type, |
| }); |
| |
| max_dw_count += 3; |
| uint64_t addr = shader->hdr_addr; |
| if (pdev->info.cls_eng3d >= VOLTA_A) { |
| P_MTHD(p, NVC397, SET_PIPELINE_PROGRAM_ADDRESS_A(idx)); |
| P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_A(p, idx, addr >> 32); |
| P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_B(p, idx, addr); |
| } else { |
      assert(addr <= UINT32_MAX);
| P_IMMD(p, NV9097, SET_PIPELINE_PROGRAM(idx), addr); |
| } |
| |
| max_dw_count += 3; |
| P_MTHD(p, NVC397, SET_PIPELINE_REGISTER_COUNT(idx)); |
| P_NVC397_SET_PIPELINE_REGISTER_COUNT(p, idx, shader->info.num_gprs); |
| P_NVC397_SET_PIPELINE_BINDING(p, idx, |
| nvk_pipeline_bind_group(shader->info.stage)); |
| |
| if (shader->info.stage == MESA_SHADER_TESS_EVAL) { |
| max_dw_count += 2; |
| P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS)); |
| P_INLINE_DATA(p, nvk_mme_tess_params(shader->info.ts.domain, |
| shader->info.ts.spacing, |
| shader->info.ts.prims)); |
| } |
| |
| if (shader->info.stage == MESA_SHADER_FRAGMENT) { |
| max_dw_count += 13; |
| |
| P_MTHD(p, NVC397, SET_SUBTILING_PERF_KNOB_A); |
| P_NV9097_SET_SUBTILING_PERF_KNOB_A(p, { |
| .fraction_of_spm_register_file_per_subtile = 0x10, |
| .fraction_of_spm_pixel_output_buffer_per_subtile = 0x40, |
| .fraction_of_spm_triangle_ram_per_subtile = 0x16, |
| .fraction_of_max_quads_per_subtile = 0x20, |
| }); |
| P_NV9097_SET_SUBTILING_PERF_KNOB_B(p, 0x20); |
| |
| P_IMMD(p, NV9097, SET_API_MANDATED_EARLY_Z, |
| shader->info.fs.early_fragment_tests); |
| |
| if (pdev->info.cls_eng3d >= MAXWELL_B) { |
| P_IMMD(p, NVB197, SET_POST_Z_PS_IMASK, |
| shader->info.fs.post_depth_coverage); |
| } else { |
| assert(!shader->info.fs.post_depth_coverage); |
| } |
| |
| P_IMMD(p, NV9097, SET_ZCULL_BOUNDS, { |
| .z_min_unbounded_enable = shader->info.fs.writes_depth, |
| .z_max_unbounded_enable = shader->info.fs.writes_depth, |
| }); |
| |
| if (pdev->info.cls_eng3d >= TURING_A) { |
| /* From the Vulkan 1.3.297 spec: |
| * |
| * "If sample shading is enabled, an implementation must invoke |
| * the fragment shader at least |
| * |
| * max( ⌈ minSampleShading × rasterizationSamples ⌉, 1) |
| * |
| * times per fragment." |
| * |
| * The max() here means that, regardless of the actual value of |
| * minSampleShading, we need to invoke at least once per pixel, |
| * meaning that we need to disable fragment shading rate. We also |
| * need to disable FSR if sample shading is used by the shader. |
| */ |
| P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL)); |
| P_INLINE_DATA(p, nvk_mme_shading_rate_control_sample_shading( |
| shader->sample_shading_enable || |
| shader->info.fs.uses_sample_shading)); |
| } |
| |
      /* This is the minSampleShading we program via NVK_MME_SET_ANTI_ALIAS.
       * If the shader itself demands per-sample shading, force it to 1.0.
       */
      float mss;
      if (shader->info.fs.uses_sample_shading) {
         mss = 1;
      } else if (shader->sample_shading_enable) {
         mss = CLAMP(shader->min_sample_shading, 0, 1);
      } else {
         mss = 0;
      }
| P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS)); |
| P_INLINE_DATA(p, nvk_mme_anti_alias_min_sample_shading(mss)); |
| } |
| |
| /* Stash this before we do XFB and clip/cull */ |
| shader->push_dw_count = nv_push_dw_count(&push); |
| assert(max_dw_count == |
| nvk_max_shader_push_dw(pdev, shader->info.stage, false)); |
| |
| if (shader->info.stage != MESA_SHADER_FRAGMENT && |
| shader->info.stage != MESA_SHADER_TESS_CTRL) { |
| max_dw_count += 8; |
| |
| P_IMMD(p, NV9097, SET_RT_LAYER, { |
| .v = 0, |
| .control = shader->info.vtg.writes_layer ? |
| CONTROL_GEOMETRY_SHADER_SELECTS_LAYER : |
| CONTROL_V_SELECTS_LAYER, |
| }); |
| |
| if (pdev->info.cls_eng3d >= AMPERE_B) { |
| P_IMMD(p, NVC797, SET_VARIABLE_PIXEL_RATE_SHADING_TABLE_SELECT, { |
| .source = shader->info.vtg.writes_vprs_table_index ? |
| SOURCE_FROM_VPRS_TABLE_INDEX : |
| SOURCE_FROM_CONSTANT, |
| .source_constant_value = 0, |
| }); |
| } |
| |
| const uint8_t clip_enable = shader->info.vtg.clip_enable; |
| const uint8_t cull_enable = shader->info.vtg.cull_enable; |
| P_IMMD(p, NV9097, SET_USER_CLIP_ENABLE, { |
| .plane0 = ((clip_enable | cull_enable) >> 0) & 1, |
| .plane1 = ((clip_enable | cull_enable) >> 1) & 1, |
| .plane2 = ((clip_enable | cull_enable) >> 2) & 1, |
| .plane3 = ((clip_enable | cull_enable) >> 3) & 1, |
| .plane4 = ((clip_enable | cull_enable) >> 4) & 1, |
| .plane5 = ((clip_enable | cull_enable) >> 5) & 1, |
| .plane6 = ((clip_enable | cull_enable) >> 6) & 1, |
| .plane7 = ((clip_enable | cull_enable) >> 7) & 1, |
| }); |
| P_IMMD(p, NV9097, SET_USER_CLIP_OP, { |
| .plane0 = (cull_enable >> 0) & 1, |
| .plane1 = (cull_enable >> 1) & 1, |
| .plane2 = (cull_enable >> 2) & 1, |
| .plane3 = (cull_enable >> 3) & 1, |
| .plane4 = (cull_enable >> 4) & 1, |
| .plane5 = (cull_enable >> 5) & 1, |
| .plane6 = (cull_enable >> 6) & 1, |
| .plane7 = (cull_enable >> 7) & 1, |
| }); |
| |
| struct nak_xfb_info *xfb = &shader->info.vtg.xfb; |
| for (uint8_t b = 0; b < ARRAY_SIZE(xfb->attr_count); b++) { |
| const uint8_t attr_count = xfb->attr_count[b]; |
| |
| max_dw_count += 5 + (128 / 4); |
| |
| P_MTHD(p, NV9097, SET_STREAM_OUT_CONTROL_STREAM(b)); |
| P_NV9097_SET_STREAM_OUT_CONTROL_STREAM(p, b, xfb->stream[b]); |
| P_NV9097_SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(p, b, attr_count); |
| P_NV9097_SET_STREAM_OUT_CONTROL_STRIDE(p, b, xfb->stride[b]); |
| |
| if (attr_count > 0) { |
| /* upload packed varying indices in multiples of 4 bytes */ |
| const uint32_t n = DIV_ROUND_UP(attr_count, 4); |
| P_MTHD(p, NV9097, SET_STREAM_OUT_LAYOUT_SELECT(b, 0)); |
| P_INLINE_ARRAY(p, (const uint32_t*)xfb->attr_index[b], n); |
| } |
| } |
| |
| shader->vtgm_push_dw_count = nv_push_dw_count(&push); |
| assert(max_dw_count == |
| nvk_max_shader_push_dw(pdev, shader->info.stage, true)); |
| } |
| |
| assert(nv_push_dw_count(&push) <= max_dw_count); |
| assert(max_dw_count <= ARRAY_SIZE(push_dw)); |
| |
| uint16_t dw_count = nv_push_dw_count(&push); |
| shader->push_dw = |
| vk_zalloc2(&dev->vk.alloc, pAllocator, dw_count * sizeof(*push_dw), |
| sizeof(*push_dw), VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (shader->push_dw == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| memcpy(shader->push_dw, push_dw, dw_count * sizeof(*push_dw)); |
| |
| return VK_SUCCESS; |
| } |
| |
| static const struct vk_shader_ops nvk_shader_ops; |
| |
| static void |
| nvk_shader_destroy(struct vk_device *vk_dev, |
| struct vk_shader *vk_shader, |
| const VkAllocationCallbacks* pAllocator) |
| { |
| struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk); |
| struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk); |
| |
| vk_free2(&dev->vk.alloc, pAllocator, shader->push_dw); |
| |
| if (shader->upload_size > 0) { |
| nvk_heap_free(dev, &dev->shader_heap, |
| shader->upload_addr, |
| shader->upload_size); |
| } |
| |
| if (shader->nak) { |
| nak_shader_bin_destroy(shader->nak); |
| } else { |
| /* This came from deserialize, just free it */ |
| free((void *)shader->code_ptr); |
| } |
| |
| free((void *)shader->data_ptr); |
| |
| vk_shader_free(&dev->vk, pAllocator, &shader->vk); |
| } |
| |
| static VkResult |
| nvk_compile_shader(struct nvk_device *dev, |
| struct vk_shader_compile_info *info, |
| const struct vk_graphics_pipeline_state *state, |
| const VkAllocationCallbacks* pAllocator, |
| struct vk_shader **shader_out) |
| { |
| struct nvk_shader *shader; |
| VkResult result; |
| |
| /* We consume the NIR, regardless of success or failure */ |
| nir_shader *nir = info->nir; |
| |
| shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info->stage, |
| pAllocator, sizeof(*shader)); |
| if (shader == NULL) { |
| ralloc_free(nir); |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| nvk_lower_nir(dev, nir, info->flags, info->robustness, |
| info->set_layout_count, info->set_layouts, |
| &shader->cbuf_map); |
| |
| struct nak_fs_key fs_key_tmp, *fs_key = NULL; |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) { |
| nvk_populate_fs_key(&fs_key_tmp, state); |
| fs_key = &fs_key_tmp; |
| } |
| |
| result = nvk_compile_nir(dev, nir, info->flags, info->robustness, |
| fs_key, shader); |
| ralloc_free(nir); |
| if (result != VK_SUCCESS) { |
| nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return result; |
| } |
| |
| if (dev->nvkmd) { |
| result = nvk_shader_upload(dev, shader); |
| if (result != VK_SUCCESS) { |
| nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return result; |
| } |
| } |
| |
| if (info->stage == MESA_SHADER_FRAGMENT) { |
| if (state != NULL && state->ms != NULL) { |
| shader->sample_shading_enable = state->ms->sample_shading_enable; |
| if (state->ms->sample_shading_enable) |
| shader->min_sample_shading = state->ms->min_sample_shading; |
| } |
| } |
| |
| if (info->stage != MESA_SHADER_COMPUTE && dev->nvkmd) { |
| result = nvk_shader_fill_push(dev, shader, pAllocator); |
| if (result != VK_SUCCESS) { |
| nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return result; |
| } |
| } |
| |
| *shader_out = &shader->vk; |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult |
| nvk_compile_nir_shader(struct nvk_device *dev, nir_shader *nir, |
| const VkAllocationCallbacks *alloc, |
| struct nvk_shader **shader_out) |
| { |
| const struct nvk_physical_device *pdev = nvk_device_physical(dev); |
| |
| const struct vk_pipeline_robustness_state rs_none = { |
| .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, |
| .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, |
| .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT, |
| }; |
| |
| assert(nir->info.stage == MESA_SHADER_COMPUTE); |
| if (nir->options == NULL) |
| nir->options = nvk_get_nir_options((struct vk_physical_device *)&pdev->vk, |
| nir->info.stage, &rs_none); |
| |
| struct vk_shader_compile_info info = { |
| .stage = nir->info.stage, |
| .nir = nir, |
| .robustness = &rs_none, |
| }; |
| |
| struct vk_shader *shader = NULL; |
| VkResult result = nvk_compile_shader(dev, &info, NULL, alloc, &shader); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| *shader_out = container_of(shader, struct nvk_shader, vk); |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| nvk_compile_shaders(struct vk_device *vk_dev, |
| uint32_t shader_count, |
| struct vk_shader_compile_info *infos, |
| const struct vk_graphics_pipeline_state *state, |
| const struct vk_features *enabled_features, |
| const VkAllocationCallbacks* pAllocator, |
| struct vk_shader **shaders_out) |
| { |
| struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk); |
| |
| for (uint32_t i = 0; i < shader_count; i++) { |
| VkResult result = nvk_compile_shader(dev, &infos[i], state, |
| pAllocator, &shaders_out[i]); |
| if (result != VK_SUCCESS) { |
| /* Clean up all the shaders before this point */ |
| for (uint32_t j = 0; j < i; j++) |
| nvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator); |
| |
| /* Clean up all the NIR after this point */ |
| for (uint32_t j = i + 1; j < shader_count; j++) |
| ralloc_free(infos[j].nir); |
| |
| /* Memset the output array */ |
| memset(shaders_out, 0, shader_count * sizeof(*shaders_out)); |
| |
| return result; |
| } |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| nvk_deserialize_shader(struct vk_device *vk_dev, |
| struct blob_reader *blob, |
| uint32_t binary_version, |
| const VkAllocationCallbacks* pAllocator, |
| struct vk_shader **shader_out) |
| { |
| struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk); |
| struct nvk_shader *shader; |
| VkResult result; |
| |
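   /* This must read back exactly what nvk_shader_serialize() writes */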
| struct nak_shader_info info; |
| blob_copy_bytes(blob, &info, sizeof(info)); |
| |
| struct nvk_cbuf_map cbuf_map; |
| blob_copy_bytes(blob, &cbuf_map, sizeof(cbuf_map)); |
| |
| bool sample_shading_enable; |
| blob_copy_bytes(blob, &sample_shading_enable, sizeof(sample_shading_enable)); |
| |
| float min_sample_shading; |
| blob_copy_bytes(blob, &min_sample_shading, sizeof(min_sample_shading)); |
| |
| const uint32_t code_size = blob_read_uint32(blob); |
| const uint32_t data_size = blob_read_uint32(blob); |
| if (blob->overrun) |
| return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); |
| |
| shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info.stage, |
| pAllocator, sizeof(*shader)); |
| if (shader == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| shader->info = info; |
| shader->cbuf_map = cbuf_map; |
| shader->sample_shading_enable = sample_shading_enable; |
| shader->min_sample_shading = min_sample_shading; |
| shader->code_size = code_size; |
| shader->data_size = data_size; |
| |
| shader->code_ptr = malloc(code_size); |
| if (shader->code_ptr == NULL) { |
| nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| shader->data_ptr = malloc(data_size); |
| if (shader->data_ptr == NULL) { |
| nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size); |
| blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size); |
| if (blob->overrun) { |
| nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); |
| } |
| |
| if (dev->nvkmd) { |
| result = nvk_shader_upload(dev, shader); |
| if (result != VK_SUCCESS) { |
| nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return result; |
| } |
| } |
| |
| if (info.stage != MESA_SHADER_COMPUTE && dev->nvkmd) { |
| result = nvk_shader_fill_push(dev, shader, pAllocator); |
| if (result != VK_SUCCESS) { |
| nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return result; |
| } |
| } |
| |
| *shader_out = &shader->vk; |
| |
| return VK_SUCCESS; |
| } |
| |
| static bool |
| nvk_shader_serialize(struct vk_device *vk_dev, |
| const struct vk_shader *vk_shader, |
| struct blob *blob) |
| { |
| struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk); |
| |
   /* We can't currently cache assembly */
| if (shader->nak != NULL && shader->nak->asm_str != NULL) |
| return false; |
| |
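   /* Keep this layout in sync with nvk_deserialize_shader() */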
| blob_write_bytes(blob, &shader->info, sizeof(shader->info)); |
| blob_write_bytes(blob, &shader->cbuf_map, sizeof(shader->cbuf_map)); |
| blob_write_bytes(blob, &shader->sample_shading_enable, |
| sizeof(shader->sample_shading_enable)); |
| blob_write_bytes(blob, &shader->min_sample_shading, |
| sizeof(shader->min_sample_shading)); |
| |
| blob_write_uint32(blob, shader->code_size); |
| blob_write_uint32(blob, shader->data_size); |
| blob_write_bytes(blob, shader->code_ptr, shader->code_size); |
| blob_write_bytes(blob, shader->data_ptr, shader->data_size); |
| |
| return !blob->out_of_memory; |
| } |
| |
| #define WRITE_STR(field, ...) ({ \ |
| memset(field, 0, sizeof(field)); \ |
| UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \ |
| assert(i > 0 && i < sizeof(field)); \ |
| }) |
| |
| static VkResult |
| nvk_shader_get_executable_properties( |
| UNUSED struct vk_device *device, |
| const struct vk_shader *vk_shader, |
| uint32_t *executable_count, |
| VkPipelineExecutablePropertiesKHR *properties) |
| { |
| struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk); |
| VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, |
| properties, executable_count); |
| |
| vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) { |
| props->stages = mesa_to_vk_shader_stage(shader->info.stage); |
| props->subgroupSize = 32; |
| WRITE_STR(props->name, "%s", |
| _mesa_shader_stage_to_string(shader->info.stage)); |
| WRITE_STR(props->description, "%s shader", |
| _mesa_shader_stage_to_string(shader->info.stage)); |
| } |
| |
| return vk_outarray_status(&out); |
| } |
| |
| static VkResult |
| nvk_shader_get_executable_statistics( |
| UNUSED struct vk_device *device, |
| const struct vk_shader *vk_shader, |
| uint32_t executable_index, |
| uint32_t *statistic_count, |
| VkPipelineExecutableStatisticKHR *statistics) |
| { |
| struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk); |
| VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, |
| statistics, statistic_count); |
| |
| assert(executable_index == 0); |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "Instruction count"); |
| WRITE_STR(stat->description, "Number of instructions used by this shader"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->info.num_instrs; |
| } |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "Static cycle count"); |
| WRITE_STR(stat->description, |
| "Total cycles used by fixed-latency instructions in this shader"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->info.num_static_cycles; |
| } |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "Max warps/SM"); |
| WRITE_STR(stat->description, |
| "Maximum number of warps per SM based on static information"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->info.max_warps_per_sm; |
| } |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "Spills to memory"); |
| WRITE_STR(stat->description, "Number of spills from GPRs to memory"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->info.num_spills_to_mem; |
| } |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "Fills from memory"); |
| WRITE_STR(stat->description, "Number of fills from memory to GPRs"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
      stat->value.u64 = shader->info.num_fills_from_mem;
| } |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "Spills to reg"); |
| WRITE_STR(stat->description, |
| "Number of spills between different register files"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->info.num_spills_to_reg; |
| } |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "Fills from reg"); |
| WRITE_STR(stat->description, |
| "Number of fills between different register files"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->info.num_fills_from_reg; |
| } |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "Code size"); |
| WRITE_STR(stat->description, |
| "Size of the compiled shader binary, in bytes"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->code_size; |
| } |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "Number of GPRs"); |
| WRITE_STR(stat->description, "Number of GPRs used by this pipeline"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->info.num_gprs; |
| } |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { |
| WRITE_STR(stat->name, "SLM size"); |
| WRITE_STR(stat->description, |
| "Size of shader local (scratch) memory, in bytes"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->info.slm_size; |
| } |
| |
| return vk_outarray_status(&out); |
| } |
| |
| static bool |
| write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, |
| const char *data) |
| { |
| ir->isText = VK_TRUE; |
| |
| size_t data_len = strlen(data) + 1; |
| |
| if (ir->pData == NULL) { |
| ir->dataSize = data_len; |
| return true; |
| } |
| |
| strncpy(ir->pData, data, ir->dataSize); |
| if (ir->dataSize < data_len) |
| return false; |
| |
| ir->dataSize = data_len; |
| return true; |
| } |
| |
| static VkResult |
| nvk_shader_get_executable_internal_representations( |
| UNUSED struct vk_device *device, |
| const struct vk_shader *vk_shader, |
| uint32_t executable_index, |
| uint32_t *internal_representation_count, |
| VkPipelineExecutableInternalRepresentationKHR *internal_representations) |
| { |
| struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk); |
| VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, |
| internal_representations, |
| internal_representation_count); |
| bool incomplete_text = false; |
| |
| assert(executable_index == 0); |
| |
| if (shader->nak != NULL && shader->nak->asm_str != NULL) { |
| vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { |
| WRITE_STR(ir->name, "NAK assembly"); |
| WRITE_STR(ir->description, "NAK assembly"); |
| if (!write_ir_text(ir, shader->nak->asm_str)) |
| incomplete_text = true; |
| } |
| } |
| |
| return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out); |
| } |
| |
| static const struct vk_shader_ops nvk_shader_ops = { |
| .destroy = nvk_shader_destroy, |
| .serialize = nvk_shader_serialize, |
| .get_executable_properties = nvk_shader_get_executable_properties, |
| .get_executable_statistics = nvk_shader_get_executable_statistics, |
| .get_executable_internal_representations = |
| nvk_shader_get_executable_internal_representations, |
| }; |
| |
| const struct vk_device_shader_ops nvk_device_shader_ops = { |
| .get_nir_options = nvk_get_nir_options, |
| .get_spirv_options = nvk_get_spirv_options, |
| .preprocess_nir = nvk_preprocess_nir, |
| .hash_state = nvk_hash_state, |
| .compile = nvk_compile_shaders, |
| .deserialize = nvk_deserialize_shader, |
| .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state, |
| .cmd_bind_shaders = nvk_cmd_bind_shaders, |
| }; |