| /* |
| * Copyright © 2021 Collabora Ltd. |
| * |
| * Derived from tu_shader.c which is: |
| * Copyright © 2019 Google LLC |
| * |
| * Also derived from anv_pipeline.c which is |
| * Copyright © 2015 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
| * DEALINGS IN THE SOFTWARE. |
| */ |
| |
| #include "genxml/gen_macros.h" |
| |
| #include "panvk_cmd_buffer.h" |
| #include "panvk_device.h" |
| #include "panvk_instance.h" |
| #include "panvk_mempool.h" |
| #include "panvk_physical_device.h" |
| #include "panvk_shader.h" |
| |
| #include "spirv/nir_spirv.h" |
| #include "util/memstream.h" |
| #include "util/mesa-sha1.h" |
| #include "util/u_dynarray.h" |
| #include "nir_builder.h" |
| #include "nir_conversion_builder.h" |
| #include "nir_deref.h" |
| |
| #include "vk_graphics_state.h" |
| #include "vk_shader_module.h" |
| |
| #include "compiler/bifrost_nir.h" |
| #include "util/pan_lower_framebuffer.h" |
| #include "pan_shader.h" |
| |
| #include "vk_log.h" |
| #include "vk_pipeline.h" |
| #include "vk_pipeline_layout.h" |
| #include "vk_shader.h" |
| #include "vk_util.h" |
| |
| static nir_def * |
| load_sysval_from_push_const(nir_builder *b, unsigned offset, unsigned bit_size, |
| unsigned num_comps) |
| { |
| return nir_load_push_constant( |
| b, num_comps, bit_size, nir_imm_int(b, 0), |
| /* Push constants are placed first, and then come the sysvals. */ |
| .base = offset + 256, .range = num_comps * bit_size / 8); |
| } |
| |
| static bool |
| panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| unsigned num_comps = intr->def.num_components; |
| unsigned bit_size = intr->def.bit_size; |
| nir_def *val = NULL; |
| b->cursor = nir_before_instr(instr); |
| |
| #define SYSVAL(ptype, name) offsetof(struct panvk_##ptype##_sysvals, name) |
| switch (intr->intrinsic) { |
| case nir_intrinsic_load_base_workgroup_id: |
| val = load_sysval_from_push_const(b, SYSVAL(compute, base), bit_size, |
| num_comps); |
| break; |
| case nir_intrinsic_load_num_workgroups: |
| val = load_sysval_from_push_const(b, SYSVAL(compute, num_work_groups), |
| bit_size, num_comps); |
| break; |
| case nir_intrinsic_load_workgroup_size: |
| val = load_sysval_from_push_const(b, SYSVAL(compute, local_group_size), |
| bit_size, num_comps); |
| break; |
| case nir_intrinsic_load_viewport_scale: |
| val = load_sysval_from_push_const(b, SYSVAL(graphics, viewport.scale), |
| bit_size, num_comps); |
| break; |
| case nir_intrinsic_load_viewport_offset: |
| val = load_sysval_from_push_const(b, SYSVAL(graphics, viewport.offset), |
| bit_size, num_comps); |
| break; |
| case nir_intrinsic_load_first_vertex: |
| val = load_sysval_from_push_const(b, SYSVAL(graphics, vs.first_vertex), |
| bit_size, num_comps); |
| break; |
| case nir_intrinsic_load_base_vertex: |
| val = load_sysval_from_push_const(b, SYSVAL(graphics, vs.base_vertex), |
| bit_size, num_comps); |
| break; |
| case nir_intrinsic_load_base_instance: |
| val = load_sysval_from_push_const(b, SYSVAL(graphics, vs.base_instance), |
| bit_size, num_comps); |
| break; |
| case nir_intrinsic_load_blend_const_color_rgba: |
| val = load_sysval_from_push_const(b, SYSVAL(graphics, blend.constants), |
| bit_size, num_comps); |
| break; |
| |
| case nir_intrinsic_load_layer_id: |
| assert(b->shader->info.stage == MESA_SHADER_FRAGMENT); |
| val = load_sysval_from_push_const(b, SYSVAL(graphics, layer_id), bit_size, |
| num_comps); |
| break; |
| |
| default: |
| return false; |
| } |
| #undef SYSVAL |
| |
| b->cursor = nir_after_instr(instr); |
| nir_def_rewrite_uses(&intr->def, val); |
| return true; |
| } |
| |
| static bool |
| lower_gl_pos_layer_writes(nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| if (intr->intrinsic != nir_intrinsic_copy_deref) |
| return false; |
| |
| nir_variable *dst_var = nir_intrinsic_get_var(intr, 0); |
| nir_variable *src_var = nir_intrinsic_get_var(intr, 1); |
| |
| if (!dst_var || dst_var->data.mode != nir_var_shader_out || !src_var || |
| src_var->data.mode != nir_var_shader_temp) |
| return false; |
| |
| if (dst_var->data.location == VARYING_SLOT_LAYER) { |
| /* We don't really write the layer, we just make sure primitives are |
| * discarded if gl_Layer doesn't match the layer passed to the draw. |
| */ |
| b->cursor = nir_instr_remove(instr); |
| return true; |
| } |
| |
| if (dst_var->data.location == VARYING_SLOT_POS) { |
| nir_variable *temp_layer_var = data; |
| nir_variable *temp_pos_var = src_var; |
| |
| b->cursor = nir_before_instr(instr); |
| nir_def *layer = nir_load_var(b, temp_layer_var); |
| nir_def *pos = nir_load_var(b, temp_pos_var); |
| nir_def *inf_pos = nir_imm_vec4(b, INFINITY, INFINITY, INFINITY, 1.0f); |
| nir_def *ref_layer = load_sysval_from_push_const( |
| b, offsetof(struct panvk_graphics_sysvals, layer_id), 32, 1); |
| |
| nir_store_var(b, temp_pos_var, |
| nir_bcsel(b, nir_ieq(b, layer, ref_layer), pos, inf_pos), |
| 0xf); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static bool |
| lower_layer_writes(nir_shader *nir) |
| { |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) |
| return false; |
| |
| nir_variable *temp_layer_var = NULL; |
| bool has_layer_var = false; |
| |
| nir_foreach_variable_with_modes(var, nir, |
| nir_var_shader_out | nir_var_shader_temp) { |
| if (var->data.mode == nir_var_shader_out && |
| var->data.location == VARYING_SLOT_LAYER) |
| has_layer_var = true; |
| |
| if (var->data.mode == nir_var_shader_temp && |
| var->data.location == VARYING_SLOT_LAYER) |
| temp_layer_var = var; |
| } |
| |
| if (!has_layer_var) |
| return false; |
| |
| assert(temp_layer_var); |
| |
   return nir_shader_instructions_pass(nir, lower_gl_pos_layer_writes,
                                       nir_metadata_control_flow,
                                       temp_layer_var);
| } |
| |
| static void |
| shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) |
| { |
| assert(glsl_type_is_vector_or_scalar(type)); |
| |
| uint32_t comp_size = |
| glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; |
| unsigned length = glsl_get_vector_elements(type); |
   *size = comp_size * length;
   *align = comp_size * (length == 3 ? 4 : length);
| } |
| |
| static inline nir_address_format |
| panvk_buffer_ubo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness) |
| { |
| switch (robustness) { |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT: |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT: |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT: |
| return nir_address_format_32bit_index_offset; |
| default: |
| unreachable("Invalid robust buffer access behavior"); |
| } |
| } |
| |
| static inline nir_address_format |
| panvk_buffer_ssbo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness) |
| { |
| switch (robustness) { |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT: |
| return nir_address_format_64bit_global_32bit_offset; |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT: |
| case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT: |
| return nir_address_format_64bit_bounded_global; |
| default: |
| unreachable("Invalid robust buffer access behavior"); |
| } |
| } |
| |
| static const nir_shader_compiler_options * |
| panvk_get_nir_options(UNUSED struct vk_physical_device *vk_pdev, |
| UNUSED gl_shader_stage stage, |
| UNUSED const struct vk_pipeline_robustness_state *rs) |
| { |
| return GENX(pan_shader_get_compiler_options)(); |
| } |
| |
| static struct spirv_to_nir_options |
| panvk_get_spirv_options(UNUSED struct vk_physical_device *vk_pdev, |
| UNUSED gl_shader_stage stage, |
| const struct vk_pipeline_robustness_state *rs) |
| { |
| return (struct spirv_to_nir_options){ |
| .ubo_addr_format = panvk_buffer_ubo_addr_format(rs->uniform_buffers), |
| .ssbo_addr_format = panvk_buffer_ssbo_addr_format(rs->storage_buffers), |
| .phys_ssbo_addr_format = nir_address_format_64bit_global, |
| }; |
| } |
| |
| static void |
| panvk_preprocess_nir(UNUSED struct vk_physical_device *vk_pdev, nir_shader *nir) |
| { |
   /* Group output variables that share the same location into a single
    * vector */
| if (nir->info.stage == MESA_SHADER_FRAGMENT) |
| NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out); |
| |
| NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), |
| true, true); |
| |
   /* This needs to be done just after the io_to_temporaries pass, because we
    * rely on in/out temporaries to collect the final layer_id value.
    */
| NIR_PASS_V(nir, lower_layer_writes); |
| |
| NIR_PASS_V(nir, nir_lower_indirect_derefs, |
| nir_var_shader_in | nir_var_shader_out, UINT32_MAX); |
| |
| NIR_PASS_V(nir, nir_opt_copy_prop_vars); |
| NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all); |
| NIR_PASS_V(nir, nir_opt_loop); |
| |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) { |
| struct nir_input_attachment_options lower_input_attach_opts = { |
| .use_fragcoord_sysval = true, |
| .use_layer_id_sysval = true, |
| }; |
| |
| NIR_PASS_V(nir, nir_lower_input_attachments, &lower_input_attach_opts); |
| } |
| |
| /* Do texture lowering here. Yes, it's a duplication of the texture |
| * lowering in bifrost_compile. However, we need to lower texture stuff |
| * now, before we call panvk_per_arch(nir_lower_descriptors)() because some |
| * of the texture lowering generates nir_texop_txs which we handle as part |
| * of descriptor lowering. |
| * |
    * TODO: We really should be doing this in common code, not duplicated in
| * panvk. In order to do that, we need to rework the panfrost compile |
| * flow to look more like the Intel flow: |
| * |
| * 1. Compile SPIR-V to NIR and maybe do a tiny bit of lowering that needs |
| * to be done really early. |
| * |
| * 2. pan_preprocess_nir: Does common lowering and runs the optimization |
| * loop. Nothing here should be API-specific. |
| * |
| * 3. Do additional lowering in panvk |
| * |
| * 4. pan_postprocess_nir: Does final lowering and runs the optimization |
| * loop again. This can happen as part of the final compile. |
| * |
| * This would give us a better place to do panvk-specific lowering. |
| */ |
| nir_lower_tex_options lower_tex_options = { |
| .lower_txs_lod = true, |
| .lower_txp = ~0, |
| .lower_tg4_broadcom_swizzle = true, |
| .lower_txd = true, |
| .lower_invalid_implicit_lod = true, |
| }; |
| NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); |
| NIR_PASS_V(nir, nir_lower_system_values); |
| |
| nir_lower_compute_system_values_options options = { |
| .has_base_workgroup_id = true, |
| }; |
| |
| NIR_PASS_V(nir, nir_lower_compute_system_values, &options); |
| |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) |
| NIR_PASS_V(nir, nir_lower_wpos_center); |
| |
| NIR_PASS_V(nir, nir_split_var_copies); |
| NIR_PASS_V(nir, nir_lower_var_copies); |
| } |
| |
| static void |
| panvk_hash_graphics_state(struct vk_physical_device *device, |
| const struct vk_graphics_pipeline_state *state, |
| VkShaderStageFlags stages, blake3_hash blake3_out) |
| { |
| struct mesa_blake3 blake3_ctx; |
| _mesa_blake3_init(&blake3_ctx); |
| |
| /* We don't need to do anything here yet */ |
| |
| _mesa_blake3_final(&blake3_ctx, blake3_out); |
| } |
| |
| static void |
| panvk_lower_nir(struct panvk_device *dev, nir_shader *nir, |
| uint32_t set_layout_count, |
| struct vk_descriptor_set_layout *const *set_layouts, |
| const struct vk_pipeline_robustness_state *rs, |
| const struct panfrost_compile_inputs *compile_input, |
| struct panvk_shader *shader) |
| { |
| struct panvk_instance *instance = |
| to_panvk_instance(dev->vk.physical->instance); |
| gl_shader_stage stage = nir->info.stage; |
| |
| NIR_PASS_V(nir, panvk_per_arch(nir_lower_descriptors), dev, set_layout_count, |
| set_layouts, shader); |
| |
| NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo, |
| panvk_buffer_ubo_addr_format(rs->uniform_buffers)); |
| NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo, |
| panvk_buffer_ssbo_addr_format(rs->storage_buffers)); |
| NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const, |
| nir_address_format_32bit_offset); |
| NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_global, |
| nir_address_format_64bit_global); |
| |
| if (gl_shader_stage_uses_workgroup(stage)) { |
| if (!nir->info.shared_memory_explicit_layout) { |
| NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, |
| shared_type_info); |
| } |
| |
| NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, |
| nir_address_format_32bit_offset); |
| } |
| |
| if (stage == MESA_SHADER_VERTEX) { |
      /* We need the driver_location to match the vertex attribute location,
       * so we can use the attribute layout described by
       * vk_vertex_input_state even when there are holes in the attribute
       * locations.
       */
| nir_foreach_shader_in_variable(var, nir) { |
| assert(var->data.location >= VERT_ATTRIB_GENERIC0 && |
| var->data.location <= VERT_ATTRIB_GENERIC15); |
| var->data.driver_location = var->data.location - VERT_ATTRIB_GENERIC0; |
| } |
| } else { |
| nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, |
| stage); |
| } |
| |
| nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, |
| stage); |
| |
| /* Needed to turn shader_temp into function_temp since the backend only |
| * handles the latter for now. |
| */ |
| NIR_PASS_V(nir, nir_lower_global_vars_to_local); |
| |
| nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); |
| if (unlikely(instance->debug_flags & PANVK_DEBUG_NIR)) { |
| fprintf(stderr, "translated nir:\n"); |
| nir_print_shader(nir, stderr); |
| } |
| |
| pan_shader_preprocess(nir, compile_input->gpu_id); |
| |
| if (stage == MESA_SHADER_VERTEX) |
| NIR_PASS_V(nir, pan_lower_image_index, MAX_VS_ATTRIBS); |
| |
| NIR_PASS_V(nir, nir_shader_instructions_pass, panvk_lower_sysvals, |
| nir_metadata_control_flow, NULL); |
| } |
| |
| static VkResult |
| panvk_compile_nir(struct panvk_device *dev, nir_shader *nir, |
| VkShaderCreateFlagsEXT shader_flags, |
| struct panfrost_compile_inputs *compile_input, |
| struct panvk_shader *shader) |
| { |
| const bool dump_asm = |
| shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA; |
| |
| struct util_dynarray binary; |
| util_dynarray_init(&binary, NULL); |
| GENX(pan_shader_compile)(nir, compile_input, &binary, &shader->info); |
| |
| void *bin_ptr = util_dynarray_element(&binary, uint8_t, 0); |
| unsigned bin_size = util_dynarray_num_elements(&binary, uint8_t); |
| |
| shader->bin_size = 0; |
| shader->bin_ptr = NULL; |
| |
| if (bin_size) { |
| void *data = malloc(bin_size); |
| |
| if (data == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| memcpy(data, bin_ptr, bin_size); |
| shader->bin_size = bin_size; |
| shader->bin_ptr = data; |
| } |
| util_dynarray_fini(&binary); |
| |
| if (dump_asm) { |
| shader->nir_str = nir_shader_as_str(nir, NULL); |
| |
| char *data = NULL; |
| size_t disasm_size = 0; |
| |
| if (shader->bin_size) { |
| struct u_memstream mem; |
| if (u_memstream_open(&mem, &data, &disasm_size)) { |
| FILE *const stream = u_memstream_get(&mem); |
| pan_shader_disassemble(stream, shader->bin_ptr, shader->bin_size, |
| compile_input->gpu_id, false); |
| u_memstream_close(&mem); |
| } |
| } |
| |
      char *asm_str = malloc(disasm_size + 1);
      if (asm_str == NULL) {
         free(data);
         return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
      }
      memcpy(asm_str, data, disasm_size);
      asm_str[disasm_size] = '\0';
      free(data);
| |
| shader->asm_str = asm_str; |
| } |
| |
   /* Patch the descriptor counts */
| shader->info.ubo_count = |
| shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_UBO] + |
| shader->desc_info.dyn_ubos.count; |
| shader->info.texture_count = |
| shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_TEXTURE]; |
| shader->info.sampler_count = |
| shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_SAMPLER]; |
| |
   /* Dummy sampler: texture instructions need a valid sampler descriptor
    * even when the shader doesn't declare one (e.g. it only does
    * texelFetch).
    */
| if (!shader->info.sampler_count && shader->info.texture_count) |
| shader->info.sampler_count++; |
| |
| if (nir->info.stage == MESA_SHADER_VERTEX) { |
      /* We leave holes in the attribute locations, but pan_shader.c assumes
       * a packed attribute layout. Patch attribute_count accordingly, so
       * pan_shader_prepare_rsd() does what we expect.
       */
| uint32_t gen_attribs = |
| (shader->info.attributes_read & VERT_BIT_GENERIC_ALL) >> |
| VERT_ATTRIB_GENERIC0; |
| |
| shader->info.attribute_count = util_last_bit(gen_attribs); |
| |
| /* NULL IDVS shaders are not allowed. */ |
| if (!bin_size) |
| shader->info.vs.idvs = false; |
| } |
| |
   /* Image attributes start at MAX_VS_ATTRIBS in the VS attribute table,
    * and at zero in other stages.
    */
| if (shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] > 0) |
| shader->info.attribute_count = |
| shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] + |
| (nir->info.stage == MESA_SHADER_VERTEX ? MAX_VS_ATTRIBS : 0); |
| |
| shader->local_size.x = nir->info.workgroup_size[0]; |
| shader->local_size.y = nir->info.workgroup_size[1]; |
| shader->local_size.z = nir->info.workgroup_size[2]; |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| shader->code_mem = (struct panvk_priv_mem){0}; |
| shader->rsd = (struct panvk_priv_mem){0}; |
| |
| if (!shader->bin_size) |
| return VK_SUCCESS; |
| |
| shader->code_mem = panvk_pool_upload_aligned( |
| &dev->mempools.exec, shader->bin_ptr, shader->bin_size, 128); |
| |
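   /* Fragment shaders don't get a pre-baked RSD: the fragment renderer
    * state depends on other graphics state, so it's emitted at draw time
    * instead.
    */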
| if (shader->info.stage == MESA_SHADER_FRAGMENT) |
| return VK_SUCCESS; |
| |
| shader->rsd = panvk_pool_alloc_desc(&dev->mempools.rw, RENDERER_STATE); |
| |
| pan_pack(panvk_priv_mem_host_addr(shader->rsd), RENDERER_STATE, cfg) { |
| pan_shader_prepare_rsd(&shader->info, panvk_shader_get_dev_addr(shader), |
| &cfg); |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static void |
| panvk_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| struct panvk_device *dev = to_panvk_device(vk_dev); |
| struct panvk_shader *shader = |
| container_of(vk_shader, struct panvk_shader, vk); |
| |
| free((void *)shader->asm_str); |
| ralloc_free((void *)shader->nir_str); |
| |
| panvk_pool_free_mem(&dev->mempools.exec, shader->code_mem); |
   panvk_pool_free_mem(&dev->mempools.rw, shader->rsd);
   panvk_pool_free_mem(&dev->mempools.rw, shader->desc_info.others.map);
| |
| free((void *)shader->bin_ptr); |
| vk_shader_free(&dev->vk, pAllocator, &shader->vk); |
| } |
| |
| static const struct vk_shader_ops panvk_shader_ops; |
| |
| static VkResult |
| panvk_compile_shader(struct panvk_device *dev, |
| struct vk_shader_compile_info *info, |
| const struct vk_graphics_pipeline_state *state, |
| const VkAllocationCallbacks *pAllocator, |
| struct vk_shader **shader_out) |
| { |
| struct panvk_physical_device *phys_dev = |
| to_panvk_physical_device(dev->vk.physical); |
| |
| struct panvk_shader *shader; |
| VkResult result; |
| |
| /* We consume the NIR, regardless of success or failure */ |
| nir_shader *nir = info->nir; |
| |
| shader = vk_shader_zalloc(&dev->vk, &panvk_shader_ops, info->stage, |
| pAllocator, sizeof(*shader)); |
| if (shader == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| struct panfrost_compile_inputs inputs = { |
| .gpu_id = phys_dev->kmod.props.gpu_prod_id, |
| .no_ubo_to_push = true, |
| }; |
| |
| panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts, |
| info->robustness, &inputs, shader); |
| |
| result = panvk_compile_nir(dev, nir, info->flags, &inputs, shader); |
| |
| if (result != VK_SUCCESS) { |
| panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return result; |
| } |
| |
| result = panvk_shader_upload(dev, shader, pAllocator); |
| |
| if (result != VK_SUCCESS) { |
| panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); |
| return result; |
| } |
| |
| *shader_out = &shader->vk; |
| |
| return result; |
| } |
| |
| static VkResult |
| panvk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count, |
| struct vk_shader_compile_info *infos, |
| const struct vk_graphics_pipeline_state *state, |
| const VkAllocationCallbacks *pAllocator, |
| struct vk_shader **shaders_out) |
| { |
| struct panvk_device *dev = to_panvk_device(vk_dev); |
| VkResult result; |
| uint32_t i; |
| |
| for (i = 0; i < shader_count; i++) { |
| result = panvk_compile_shader(dev, &infos[i], state, pAllocator, |
| &shaders_out[i]); |
| |
| /* Clean up NIR for the current shader */ |
| ralloc_free(infos[i].nir); |
| |
| if (result != VK_SUCCESS) |
| goto err_cleanup; |
| } |
| |
| /* TODO: If we get multiple shaders here, we can perform part of the link |
| * logic at compile time. */ |
| |
| return VK_SUCCESS; |
| |
| err_cleanup: |
| /* Clean up all the shaders before this point */ |
| for (uint32_t j = 0; j < i; j++) |
| panvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator); |
| |
| /* Clean up all the NIR after this point */ |
| for (uint32_t j = i + 1; j < shader_count; j++) |
| ralloc_free(infos[j].nir); |
| |
| /* Memset the output array */ |
| memset(shaders_out, 0, shader_count * sizeof(*shaders_out)); |
| |
| return result; |
| } |
| |
| static VkResult |
| shader_desc_info_deserialize(struct blob_reader *blob, |
| struct panvk_shader *shader) |
| { |
| shader->desc_info.used_set_mask = blob_read_uint32(blob); |
   shader->desc_info.dyn_ubos.count = blob_read_uint32(blob);
   blob_copy_bytes(blob, shader->desc_info.dyn_ubos.map,
                   sizeof(*shader->desc_info.dyn_ubos.map) *
                      shader->desc_info.dyn_ubos.count);
   shader->desc_info.dyn_ssbos.count = blob_read_uint32(blob);
   blob_copy_bytes(blob, shader->desc_info.dyn_ssbos.map,
                   sizeof(*shader->desc_info.dyn_ssbos.map) *
                      shader->desc_info.dyn_ssbos.count);
| |
| uint32_t others_count = 0; |
| for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) { |
| shader->desc_info.others.count[i] = blob_read_uint32(blob); |
| others_count += shader->desc_info.others.count[i]; |
| } |
| |
| if (others_count) { |
| struct panvk_device *dev = to_panvk_device(shader->vk.base.device); |
| struct panvk_pool_alloc_info alloc_info = { |
| .size = others_count * sizeof(uint32_t), |
| .alignment = sizeof(uint32_t), |
| }; |
| shader->desc_info.others.map = |
| panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); |
| uint32_t *copy_table = |
| panvk_priv_mem_host_addr(shader->desc_info.others.map); |
| |
| if (!copy_table) |
| return VK_ERROR_OUT_OF_DEVICE_MEMORY; |
| |
| blob_copy_bytes(blob, copy_table, others_count * sizeof(*copy_table)); |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob, |
| uint32_t binary_version, |
| const VkAllocationCallbacks *pAllocator, |
| struct vk_shader **shader_out) |
| { |
| struct panvk_device *device = to_panvk_device(vk_dev); |
| struct panvk_shader *shader; |
| VkResult result; |
| |
| struct pan_shader_info info; |
| blob_copy_bytes(blob, &info, sizeof(info)); |
| |
| struct pan_compute_dim local_size; |
| blob_copy_bytes(blob, &local_size, sizeof(local_size)); |
| |
| const uint32_t bin_size = blob_read_uint32(blob); |
| |
| if (blob->overrun) |
| return vk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); |
| |
| shader = vk_shader_zalloc(vk_dev, &panvk_shader_ops, info.stage, pAllocator, |
| sizeof(*shader)); |
| if (shader == NULL) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| shader->info = info; |
| shader->local_size = local_size; |
| shader->bin_size = bin_size; |
| |
| shader->bin_ptr = malloc(bin_size); |
| if (shader->bin_ptr == NULL) { |
| panvk_shader_destroy(vk_dev, &shader->vk, pAllocator); |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| blob_copy_bytes(blob, (void *)shader->bin_ptr, shader->bin_size); |
| |
| result = shader_desc_info_deserialize(blob, shader); |
| |
| if (result != VK_SUCCESS) { |
| panvk_shader_destroy(vk_dev, &shader->vk, pAllocator); |
| return vk_error(device, result); |
| } |
| |
| if (blob->overrun) { |
| panvk_shader_destroy(vk_dev, &shader->vk, pAllocator); |
| return vk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); |
| } |
| |
| result = panvk_shader_upload(device, shader, pAllocator); |
| |
| if (result != VK_SUCCESS) { |
| panvk_shader_destroy(vk_dev, &shader->vk, pAllocator); |
| return result; |
| } |
| |
| *shader_out = &shader->vk; |
| |
| return result; |
| } |
| |
| static void |
| shader_desc_info_serialize(struct blob *blob, const struct panvk_shader *shader) |
| { |
| blob_write_uint32(blob, shader->desc_info.used_set_mask); |
| blob_write_uint32(blob, shader->desc_info.dyn_ubos.count); |
| blob_write_bytes(blob, shader->desc_info.dyn_ubos.map, |
| sizeof(*shader->desc_info.dyn_ubos.map) * |
| shader->desc_info.dyn_ubos.count); |
| blob_write_uint32(blob, shader->desc_info.dyn_ssbos.count); |
| blob_write_bytes(blob, shader->desc_info.dyn_ssbos.map, |
| sizeof(*shader->desc_info.dyn_ssbos.map) * |
| shader->desc_info.dyn_ssbos.count); |
| |
| unsigned others_count = 0; |
| for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) { |
| blob_write_uint32(blob, shader->desc_info.others.count[i]); |
| others_count += shader->desc_info.others.count[i]; |
| } |
| |
| blob_write_bytes(blob, |
| panvk_priv_mem_host_addr(shader->desc_info.others.map), |
| sizeof(uint32_t) * others_count); |
| } |
| |
| static bool |
| panvk_shader_serialize(struct vk_device *vk_dev, |
| const struct vk_shader *vk_shader, struct blob *blob) |
| { |
| struct panvk_shader *shader = |
| container_of(vk_shader, struct panvk_shader, vk); |
| |
| /** |
| * We can't currently cache assembly |
| * TODO: Implement seriaization with assembly |
| **/ |
| if (shader->nir_str != NULL || shader->asm_str != NULL) |
| return false; |
| |
| blob_write_bytes(blob, &shader->info, sizeof(shader->info)); |
| blob_write_bytes(blob, &shader->local_size, sizeof(shader->local_size)); |
| blob_write_uint32(blob, shader->bin_size); |
| blob_write_bytes(blob, shader->bin_ptr, shader->bin_size); |
| shader_desc_info_serialize(blob, shader); |
| |
| return !blob->out_of_memory; |
| } |
| |
| #define WRITE_STR(field, ...) \ |
| ({ \ |
| memset(field, 0, sizeof(field)); \ |
| UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \ |
| assert(i > 0 && i < sizeof(field)); \ |
| }) |
| |
| static VkResult |
| panvk_shader_get_executable_properties( |
| UNUSED struct vk_device *device, const struct vk_shader *vk_shader, |
| uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties) |
| { |
| UNUSED struct panvk_shader *shader = |
| container_of(vk_shader, struct panvk_shader, vk); |
| |
| VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties, |
| executable_count); |
| |
| vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) |
| { |
| props->stages = mesa_to_vk_shader_stage(shader->info.stage); |
| props->subgroupSize = 8; |
| WRITE_STR(props->name, "%s", |
| _mesa_shader_stage_to_string(shader->info.stage)); |
| WRITE_STR(props->description, "%s shader", |
| _mesa_shader_stage_to_string(shader->info.stage)); |
| } |
| |
| return vk_outarray_status(&out); |
| } |
| |
| static VkResult |
| panvk_shader_get_executable_statistics( |
| UNUSED struct vk_device *device, const struct vk_shader *vk_shader, |
| uint32_t executable_index, uint32_t *statistic_count, |
| VkPipelineExecutableStatisticKHR *statistics) |
| { |
| UNUSED struct panvk_shader *shader = |
| container_of(vk_shader, struct panvk_shader, vk); |
| |
| VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics, |
| statistic_count); |
| |
| assert(executable_index == 0); |
| |
| vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) |
| { |
| WRITE_STR(stat->name, "Code Size"); |
| WRITE_STR(stat->description, |
| "Size of the compiled shader binary, in bytes"); |
| stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| stat->value.u64 = shader->bin_size; |
| } |
| |
| /* TODO: more executable statistics (VK_KHR_pipeline_executable_properties) */ |
| |
| return vk_outarray_status(&out); |
| } |
| |
| static bool |
| write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir, |
| const char *data) |
| { |
| ir->isText = VK_TRUE; |
| |
| size_t data_len = strlen(data) + 1; |
| |
| if (ir->pData == NULL) { |
| ir->dataSize = data_len; |
| return true; |
| } |
| |
| strncpy(ir->pData, data, ir->dataSize); |
| if (ir->dataSize < data_len) |
| return false; |
| |
| ir->dataSize = data_len; |
| return true; |
| } |
| |
| static VkResult |
| panvk_shader_get_executable_internal_representations( |
| UNUSED struct vk_device *device, const struct vk_shader *vk_shader, |
| uint32_t executable_index, uint32_t *internal_representation_count, |
| VkPipelineExecutableInternalRepresentationKHR *internal_representations) |
| { |
| UNUSED struct panvk_shader *shader = |
| container_of(vk_shader, struct panvk_shader, vk); |
| VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, |
| internal_representations, |
| internal_representation_count); |
| bool incomplete_text = false; |
| |
| if (shader->nir_str != NULL) { |
| vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, |
| &out, ir) |
| { |
| WRITE_STR(ir->name, "NIR shader"); |
| WRITE_STR(ir->description, |
| "NIR shader before sending to the back-end compiler"); |
| if (!write_ir_text(ir, shader->nir_str)) |
| incomplete_text = true; |
| } |
| } |
| |
| if (shader->asm_str != NULL) { |
| vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, |
| &out, ir) |
| { |
| WRITE_STR(ir->name, "Assembly"); |
| WRITE_STR(ir->description, "Final Assembly"); |
| if (!write_ir_text(ir, shader->asm_str)) |
| incomplete_text = true; |
| } |
| } |
| |
| return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out); |
| } |
| |
| struct varyings_info { |
| enum pipe_format fmts[VARYING_SLOT_MAX]; |
| BITSET_DECLARE(active, VARYING_SLOT_MAX); |
| }; |
| |
| static void |
| collect_varyings_info(const struct pan_shader_varying *varyings, |
| unsigned varying_count, struct varyings_info *info) |
| { |
| for (unsigned i = 0; i < varying_count; i++) { |
| gl_varying_slot loc = varyings[i].location; |
| |
| if (varyings[i].format == PIPE_FORMAT_NONE) |
| continue; |
| |
| info->fmts[loc] = varyings[i].format; |
| BITSET_SET(info->active, loc); |
| } |
| } |
| |
| static inline enum panvk_varying_buf_id |
| varying_buf_id(gl_varying_slot loc) |
| { |
| switch (loc) { |
| case VARYING_SLOT_POS: |
| return PANVK_VARY_BUF_POSITION; |
| case VARYING_SLOT_PSIZ: |
| return PANVK_VARY_BUF_PSIZ; |
| default: |
| return PANVK_VARY_BUF_GENERAL; |
| } |
| } |
| |
| static mali_pixel_format |
| varying_format(gl_varying_slot loc, enum pipe_format pfmt) |
| { |
| switch (loc) { |
| case VARYING_SLOT_PNTC: |
| case VARYING_SLOT_PSIZ: |
| #if PAN_ARCH <= 6 |
| return (MALI_R16F << 12) | panfrost_get_default_swizzle(1); |
| #else |
| return (MALI_R16F << 12) | MALI_RGB_COMPONENT_ORDER_R000; |
| #endif |
| case VARYING_SLOT_POS: |
| #if PAN_ARCH <= 6 |
| return (MALI_SNAP_4 << 12) | panfrost_get_default_swizzle(4); |
| #else |
| return (MALI_SNAP_4 << 12) | MALI_RGB_COMPONENT_ORDER_RGBA; |
| #endif |
| default: |
| return GENX(panfrost_format_from_pipe_format)(pfmt)->hw; |
| } |
| } |
| |
| static struct panvk_priv_mem |
| emit_varying_attrs(struct panvk_pool *desc_pool, |
| const struct pan_shader_varying *varyings, |
| unsigned varying_count, const struct varyings_info *info, |
| unsigned *buf_offsets) |
| { |
| unsigned attr_count = BITSET_COUNT(info->active); |
| struct panvk_priv_mem mem = |
| panvk_pool_alloc_desc_array(desc_pool, attr_count, ATTRIBUTE); |
| struct mali_attribute_packed *attrs = panvk_priv_mem_host_addr(mem); |
| unsigned attr_idx = 0; |
| |
| for (unsigned i = 0; i < varying_count; i++) { |
| pan_pack(&attrs[attr_idx++], ATTRIBUTE, cfg) { |
| gl_varying_slot loc = varyings[i].location; |
| enum pipe_format pfmt = varyings[i].format != PIPE_FORMAT_NONE |
| ? info->fmts[loc] |
| : PIPE_FORMAT_NONE; |
| |
| if (pfmt == PIPE_FORMAT_NONE) { |
| #if PAN_ARCH >= 7 |
| cfg.format = (MALI_CONSTANT << 12) | MALI_RGB_COMPONENT_ORDER_0000; |
| #else |
| cfg.format = (MALI_CONSTANT << 12) | PAN_V6_SWIZZLE(0, 0, 0, 0); |
| #endif |
| } else { |
| cfg.buffer_index = varying_buf_id(loc); |
| cfg.offset = buf_offsets[loc]; |
| cfg.format = varying_format(loc, info->fmts[loc]); |
| } |
| cfg.offset_enable = false; |
| } |
| } |
| |
| return mem; |
| } |
| |
| void |
| panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool, |
| struct panvk_shader *vs, struct panvk_shader *fs, |
| struct panvk_shader_link *link) |
| { |
| BITSET_DECLARE(active_attrs, VARYING_SLOT_MAX) = {0}; |
| unsigned buf_strides[PANVK_VARY_BUF_MAX] = {0}; |
| unsigned buf_offsets[VARYING_SLOT_MAX] = {0}; |
| struct varyings_info out_vars = {0}; |
| struct varyings_info in_vars = {0}; |
| unsigned loc; |
| |
| assert(vs); |
| assert(vs->info.stage == MESA_SHADER_VERTEX); |
| |
| collect_varyings_info(vs->info.varyings.output, |
| vs->info.varyings.output_count, &out_vars); |
| |
| if (fs) { |
| assert(fs->info.stage == MESA_SHADER_FRAGMENT); |
| collect_varyings_info(fs->info.varyings.input, |
| fs->info.varyings.input_count, &in_vars); |
| } |
| |
| BITSET_OR(active_attrs, in_vars.active, out_vars.active); |
| |
| /* Handle the position and point size buffers explicitly, as they are |
| * passed through separate buffer pointers to the tiler job. |
| */ |
| if (BITSET_TEST(out_vars.active, VARYING_SLOT_POS)) { |
| buf_strides[PANVK_VARY_BUF_POSITION] = sizeof(float) * 4; |
| BITSET_CLEAR(active_attrs, VARYING_SLOT_POS); |
| } |
| |
| if (BITSET_TEST(out_vars.active, VARYING_SLOT_PSIZ)) { |
| buf_strides[PANVK_VARY_BUF_PSIZ] = sizeof(uint16_t); |
| BITSET_CLEAR(active_attrs, VARYING_SLOT_PSIZ); |
| } |
| |
| BITSET_FOREACH_SET(loc, active_attrs, VARYING_SLOT_MAX) { |
      /* We expect the VS to write all the inputs read by the FS, and the
       * FS to read all the outputs written by the VS. If that's not the
       * case, we keep PIPE_FORMAT_NONE to reflect the fact that we should
       * use a sink attribute (writes are discarded, reads return zeros).
       */
| if (in_vars.fmts[loc] == PIPE_FORMAT_NONE || |
| out_vars.fmts[loc] == PIPE_FORMAT_NONE) { |
| in_vars.fmts[loc] = PIPE_FORMAT_NONE; |
| out_vars.fmts[loc] = PIPE_FORMAT_NONE; |
| continue; |
| } |
| |
| unsigned out_size = util_format_get_blocksize(out_vars.fmts[loc]); |
| unsigned buf_idx = varying_buf_id(loc); |
| |
      /* Always trust the FS input format, so we can:
       * - discard components that are never read
       * - use float types for interpolated fragment shader inputs
       * - use fp16 for floats with mediump
       * - make sure components that are not written by the VS are set to
       *   zero
       */
| out_vars.fmts[loc] = in_vars.fmts[loc]; |
| |
| /* Special buffers are handled explicitly before this loop, everything |
| * else should be laid out in the general varying buffer. |
| */ |
| assert(buf_idx == PANVK_VARY_BUF_GENERAL); |
| |
      /* Keep things aligned to a 32-bit component. */
| buf_offsets[loc] = buf_strides[buf_idx]; |
| buf_strides[buf_idx] += ALIGN_POT(out_size, 4); |
| } |
| |
| link->vs.attribs = emit_varying_attrs(desc_pool, vs->info.varyings.output, |
| vs->info.varyings.output_count, |
| &out_vars, buf_offsets); |
| |
| if (fs) |
| link->fs.attribs = emit_varying_attrs(desc_pool, fs->info.varyings.input, |
| fs->info.varyings.input_count, |
| &in_vars, buf_offsets); |
| |
| memcpy(link->buf_strides, buf_strides, sizeof(link->buf_strides)); |
| } |
| |
| static const struct vk_shader_ops panvk_shader_ops = { |
| .destroy = panvk_shader_destroy, |
| .serialize = panvk_shader_serialize, |
| .get_executable_properties = panvk_shader_get_executable_properties, |
| .get_executable_statistics = panvk_shader_get_executable_statistics, |
| .get_executable_internal_representations = |
| panvk_shader_get_executable_internal_representations, |
| }; |
| |
| const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = { |
| .get_nir_options = panvk_get_nir_options, |
| .get_spirv_options = panvk_get_spirv_options, |
| .preprocess_nir = panvk_preprocess_nir, |
| .hash_graphics_state = panvk_hash_graphics_state, |
| .compile = panvk_compile_shaders, |
| .deserialize = panvk_deserialize_shader, |
| .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state, |
| .cmd_bind_shaders = panvk_per_arch(cmd_bind_shaders), |
| }; |