| /* |
| * Copyright © 2011 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "brw_shader.h" |
| #include "brw_generator.h" |
| #include "brw_eu.h" |
| #include "brw_nir.h" |
| #include "brw_private.h" |
| #include "dev/intel_debug.h" |
| |
| static void |
| brw_assign_vs_urb_setup(brw_shader &s) |
| { |
| struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(s.prog_data); |
| |
| assert(s.stage == MESA_SHADER_VERTEX); |
| |
| /* Each attribute is 4 regs. */ |
| s.first_non_payload_grf += 8 * vs_prog_data->base.urb_read_length; |
| |
| assert(vs_prog_data->base.urb_read_length <= 15); |
| |
| /* Rewrite all ATTR file references to the hw grf that they land in. */ |
| foreach_block_and_inst(block, brw_inst, inst, s.cfg) { |
| s.convert_attr_sources_to_hw_regs(inst); |
| } |
| } |
| |
| static unsigned |
| brw_nir_pack_vs_input(nir_shader *nir, struct brw_vs_prog_data *prog_data) |
| { |
| struct vf_attribute { |
| unsigned reg_offset; |
| uint8_t component_mask; |
| bool is_64bit:1; |
| bool is_used:1; |
| } attributes[MAX_HW_VERT_ATTRIB] = {}; |
| |
| /* IO lowering is going to break dmat inputs into a location each, so we |
| * need to reproduce the 64bit nature of the variable into each slot. |
| */ |
| nir_foreach_shader_in_variable(var, nir) { |
| const bool is_64bit = glsl_type_is_64bit(var->type); |
| const uint32_t slots = glsl_count_vec4_slots(var->type, true, false); |
| for (uint32_t i = 0; i < slots; i++) |
| attributes[var->data.location + i].is_64bit = is_64bit; |
| } |
| |
| /* First mark all used inputs */ |
| nir_foreach_function_impl(impl, nir) { |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_input) |
| continue; |
| |
| assert(intrin->def.bit_size == 32); |
| |
| const struct nir_io_semantics io = |
| nir_intrinsic_io_semantics(intrin); |
| |
| attributes[io.location].is_used = true; |
| |
| /* SKL PRMs, Vol 2a: Command Reference: Instructions, |
| * 3DSTATE_VF_COMPONENT_PACKING: |
| * |
| * "Software shall enable all components (XYZW) for any and all |
| * VERTEX_ELEMENTs associated with a 256-bit SURFACE_FORMAT. |
| * It is INVALID to disable any components in these cases." |
| * |
| * Enable this XYZW for any > 128-bit format. |
| */ |
| if (nir->info.dual_slot_inputs & BITFIELD64_BIT(io.location)) { |
| attributes[io.location].component_mask |= 0xff; |
| } else { |
| const uint8_t mask = |
| nir_component_mask(intrin->num_components) << |
| nir_intrinsic_component(intrin); |
| |
| attributes[io.location].component_mask |= mask; |
| } |
| } |
| } |
| } |
| |
| /* SKL PRMs, Vol 2a: Command Reference: Instructions, |
| * 3DSTATE_VF_COMPONENT_PACKING: |
| * |
| * "At least one component of one "valid" Vertex Element must be |
| * enabled." |
| */ |
| if (nir->info.inputs_read == 0) { |
| if (prog_data->no_vf_slot_compaction) { |
| attributes[VERT_ATTRIB_GENERIC0].is_used = true; |
| attributes[VERT_ATTRIB_GENERIC0].component_mask = 0x1; |
| } else if (!BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW) && |
| !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) && |
| !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) && |
| !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) && |
| !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID) && |
| !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID)) { |
| attributes[VERT_ATTRIB_GENERIC0].is_used = true; |
| attributes[VERT_ATTRIB_GENERIC0].component_mask = 0x1; |
| } |
| } |
| |
| /* Compute the register offsets */ |
| unsigned reg_offset = 0; |
| unsigned vertex_element = 0; |
| for (unsigned a = 0; a < ARRAY_SIZE(attributes); a++) { |
| if (!attributes[a].is_used) |
| continue; |
| |
| /* SKL PRMs, Vol 2a: Command Reference: Instructions, |
| * 3DSTATE_VF_COMPONENT_PACKING: |
| * |
| * "No enable bits are provided for Vertex Elements [32-33], |
| * and therefore no packing is performed on these elements (if |
| * Valid, all 4 components are stored)." |
| */ |
| if (vertex_element >= 32 || |
| (prog_data->no_vf_slot_compaction && a >= VERT_ATTRIB_GENERIC(32))) |
| attributes[a].component_mask = 0xf; |
| |
| attributes[a].reg_offset = reg_offset; |
| |
| reg_offset += util_bitcount(attributes[a].component_mask); |
| vertex_element++; |
| } |
| |
| /* Remap inputs */ |
| nir_foreach_function_impl(impl, nir) { |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_input) |
| continue; |
| |
| struct nir_io_semantics io = nir_intrinsic_io_semantics(intrin); |
| |
| unsigned slot = attributes[io.location].reg_offset / 4; |
| unsigned slot_component = |
| attributes[io.location].reg_offset % 4 + |
| util_bitcount(attributes[io.location].component_mask & |
| BITFIELD_MASK(io.high_dvec2 * 4 + |
| nir_intrinsic_component(intrin))); |
| |
| slot += slot_component / 4; |
| slot_component %= 4; |
| |
| nir_intrinsic_set_base(intrin, slot); |
| nir_intrinsic_set_component(intrin, slot_component); |
| } |
| } |
| } |
| |
| /* Generate the packing array, we start from the first application |
| * attribute : VERT_ATTRIB_GENERIC0 |
| */ |
| unsigned vf_element_count = 0; |
| for (unsigned a = VERT_ATTRIB_GENERIC0; a < ARRAY_SIZE(attributes) && vf_element_count < 32; a++) { |
| /* Consider all attributes used when no slot compaction is active */ |
| if (!attributes[a].is_used && !prog_data->no_vf_slot_compaction) |
| continue; |
| |
| uint32_t mask; |
| /* Stores masks in attributes[a].component_mask are in terms of 32-bit |
| * components, but the HW depending on the format will interpret |
| * prog_data->vf_component_packing[] bits as either a 32-bit or 64-bit |
| * component. So we need to only consider every other bit. |
| */ |
| if (attributes[a].is_64bit) { |
| mask = 0; |
| u_foreach_bit(b, attributes[a].component_mask) |
| mask |= BITFIELD_BIT(b / 2); |
| } else { |
| mask = attributes[a].component_mask; |
| } |
| /* We should only have 4bits enabled max */ |
| assert((mask & ~0xfu) == 0); |
| |
| prog_data->vf_component_packing[vf_element_count / 8] |= |
| mask << (4 * (vf_element_count % 8)); |
| vf_element_count++; |
| } |
| |
| return reg_offset; |
| } |
| |
| static bool |
| run_vs(brw_shader &s) |
| { |
| assert(s.stage == MESA_SHADER_VERTEX); |
| |
| s.payload_ = new brw_vs_thread_payload(s); |
| |
| brw_from_nir(&s); |
| |
| if (s.failed) |
| return false; |
| |
| s.emit_urb_writes(); |
| |
| brw_calculate_cfg(s); |
| |
| brw_optimize(s); |
| |
| s.assign_curb_setup(); |
| brw_assign_vs_urb_setup(s); |
| |
| brw_lower_3src_null_dest(s); |
| brw_workaround_emit_dummy_mov_instruction(s); |
| |
| brw_allocate_registers(s, true /* allow_spilling */); |
| |
| brw_workaround_source_arf_before_eot(s); |
| |
| return !s.failed; |
| } |
| |
| extern "C" const unsigned * |
| brw_compile_vs(const struct brw_compiler *compiler, |
| struct brw_compile_vs_params *params) |
| { |
| struct nir_shader *nir = params->base.nir; |
| const struct brw_vs_prog_key *key = params->key; |
| struct brw_vs_prog_data *prog_data = params->prog_data; |
| const bool debug_enabled = |
| brw_should_print_shader(nir, params->base.debug_flag ? |
| params->base.debug_flag : DEBUG_VS, |
| params->base.source_hash); |
| const unsigned dispatch_width = brw_geometry_stage_dispatch_width(compiler->devinfo); |
| |
| /* We only expect slot compaction to be disabled when using device |
| * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS |
| * programming. This should always be enabled together with VF component |
| * packing to minimize the size of the payload. |
| */ |
| assert(!key->no_vf_slot_compaction || key->vf_component_packing); |
| |
| brw_prog_data_init(&prog_data->base.base, ¶ms->base); |
| |
| brw_nir_apply_key(nir, compiler, &key->base, dispatch_width); |
| |
| prog_data->inputs_read = nir->info.inputs_read; |
| prog_data->double_inputs_read = nir->info.vs.double_inputs; |
| prog_data->no_vf_slot_compaction = key->no_vf_slot_compaction; |
| |
| brw_nir_lower_vs_inputs(nir); |
| brw_nir_lower_vue_outputs(nir); |
| |
| memset(prog_data->vf_component_packing, 0, |
| sizeof(prog_data->vf_component_packing)); |
| unsigned nr_packed_regs = 0; |
| if (key->vf_component_packing) |
| nr_packed_regs = brw_nir_pack_vs_input(nir, prog_data); |
| |
| brw_postprocess_nir(nir, compiler, debug_enabled, |
| key->base.robust_flags); |
| |
| const bool has_clip_cull_dist = |
| nir->info.outputs_written & (VARYING_BIT_CLIP_DIST0 | |
| VARYING_BIT_CLIP_DIST1 | |
| VARYING_BIT_CULL_DIST0 | |
| VARYING_BIT_CULL_DIST1); |
| prog_data->base.clip_distance_mask = has_clip_cull_dist ? |
| ((1 << nir->info.clip_distance_array_size) - 1) : 0; |
| prog_data->base.cull_distance_mask = |
| (has_clip_cull_dist ? |
| ((1 << nir->info.cull_distance_array_size) - 1) : 0) << |
| nir->info.clip_distance_array_size; |
| |
| unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read); |
| /* gl_VertexID and gl_InstanceID are system values, but arrive via an |
| * incoming vertex attribute. So, add an extra slot. |
| */ |
| if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) || |
| BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) || |
| BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) || |
| BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID)) { |
| nr_attribute_slots++; |
| } |
| |
| /* gl_DrawID and IsIndexedDraw share its very own vec4 */ |
| if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID) || |
| BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW)) { |
| nr_attribute_slots++; |
| } |
| |
| if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW)) |
| prog_data->uses_is_indexed_draw = true; |
| |
| if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX)) |
| prog_data->uses_firstvertex = true; |
| |
| if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE)) |
| prog_data->uses_baseinstance = true; |
| |
| if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE)) |
| prog_data->uses_vertexid = true; |
| |
| if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID)) |
| prog_data->uses_instanceid = true; |
| |
| if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID)) |
| prog_data->uses_drawid = true; |
| |
| unsigned nr_attribute_regs; |
| if (key->vf_component_packing) { |
| prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8); |
| nr_attribute_regs = nr_packed_regs; |
| } else { |
| prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2); |
| nr_attribute_regs = 4 * (nr_attribute_slots); |
| } |
| |
| /* Since vertex shaders reuse the same VUE entry for inputs and outputs |
| * (overwriting the original contents), we need to make sure the size is |
| * the larger of the two. |
| */ |
| const unsigned vue_entries = |
| MAX2(DIV_ROUND_UP(nr_attribute_regs, 4), |
| (unsigned)prog_data->base.vue_map.num_slots); |
| |
| prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); |
| |
| if (unlikely(debug_enabled)) { |
| fprintf(stderr, "VS Output "); |
| brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX); |
| } |
| |
| prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; |
| |
| brw_shader v(compiler, ¶ms->base, &key->base, |
| &prog_data->base.base, nir, dispatch_width, |
| params->base.stats != NULL, debug_enabled); |
| if (!run_vs(v)) { |
| params->base.error_str = |
| ralloc_strdup(params->base.mem_ctx, v.fail_msg); |
| return NULL; |
| } |
| |
| assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0); |
| prog_data->base.base.dispatch_grf_start_reg = |
| v.payload().num_regs / reg_unit(compiler->devinfo); |
| prog_data->base.base.grf_used = v.grf_used; |
| |
| brw_generator g(compiler, ¶ms->base, |
| &prog_data->base.base, |
| MESA_SHADER_VERTEX); |
| if (unlikely(debug_enabled)) { |
| const char *debug_name = |
| ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s", |
| nir->info.label ? nir->info.label : |
| "unnamed", |
| nir->info.name); |
| |
| g.enable_debug(debug_name); |
| } |
| g.generate_code(v.cfg, dispatch_width, v.shader_stats, |
| v.performance_analysis.require(), params->base.stats); |
| g.add_const_data(nir->constant_data, nir->constant_data_size); |
| |
| return g.get_assembly(); |
| } |