| /* |
| * Copyright © 2013 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "brw_eu.h" |
| #include "intel_nir.h" |
| #include "brw_nir.h" |
| #include "brw_shader.h" |
| #include "brw_builder.h" |
| #include "brw_generator.h" |
| #include "brw_private.h" |
| #include "dev/intel_debug.h" |
| |
| /** |
| * Return the number of patches to accumulate before a MULTI_PATCH mode thread is |
| * launched. In cases with a large number of input control points and a large |
| * amount of VS outputs, the VS URB space needed to store an entire 8 patches |
| * worth of data can be prohibitive, so it can be beneficial to launch threads |
| * early. |
| * |
| * See the 3DSTATE_HS::Patch Count Threshold documentation for the recommended |
| * values. Note that 0 means to "disable" early dispatch, meaning to wait for |
| * a full 8 patches as normal. |
| */ |
| static int |
| get_patch_count_threshold(int input_control_points) |
| { |
| if (input_control_points <= 4) |
| return 0; |
| else if (input_control_points <= 6) |
| return 5; |
| else if (input_control_points <= 8) |
| return 4; |
| else if (input_control_points <= 10) |
| return 3; |
| else if (input_control_points <= 14) |
| return 2; |
| |
| /* Return patch count 1 for PATCHLIST_15 - PATCHLIST_32 */ |
| return 1; |
| } |
| |
| static void |
| brw_set_tcs_invocation_id(brw_shader &s) |
| { |
| const struct intel_device_info *devinfo = s.devinfo; |
| struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data); |
| struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; |
| const brw_builder bld = brw_builder(&s); |
| |
| const unsigned instance_id_mask = |
| (devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) : |
| (devinfo->ver >= 11) ? INTEL_MASK(22, 16) : |
| INTEL_MASK(23, 17); |
| const unsigned instance_id_shift = |
| (devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 16 : 17; |
| |
| /* Get instance number from g0.2 bits: |
| * * 7:0 on DG2+ |
| * * 22:16 on gfx11+ |
| * * 23:17 otherwise |
| */ |
| brw_reg t = |
| bld.AND(brw_reg(retype(brw_vec1_grf(0, 2), BRW_TYPE_UD)), |
| brw_imm_ud(instance_id_mask)); |
| |
| if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) { |
| /* gl_InvocationID is just the thread number */ |
| s.invocation_id = bld.SHR(t, brw_imm_ud(instance_id_shift)); |
| return; |
| } |
| |
| assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH); |
| |
| brw_reg channels_uw = bld.vgrf(BRW_TYPE_UW); |
| brw_reg channels_ud = bld.vgrf(BRW_TYPE_UD); |
| bld.MOV(channels_uw, brw_reg(brw_imm_uv(0x76543210))); |
| bld.MOV(channels_ud, channels_uw); |
| |
| if (tcs_prog_data->instances == 1) { |
| s.invocation_id = channels_ud; |
| } else { |
| /* instance_id = 8 * t + <76543210> */ |
| s.invocation_id = |
| bld.ADD(bld.SHR(t, brw_imm_ud(instance_id_shift - 3)), channels_ud); |
| } |
| } |
| |
| static void |
| brw_emit_tcs_thread_end(brw_shader &s) |
| { |
| /* Try and tag the last URB write with EOT instead of emitting a whole |
| * separate write just to finish the thread. There isn't guaranteed to |
| * be one, so this may not succeed. |
| */ |
| if (s.mark_last_urb_write_with_eot()) |
| return; |
| |
| const brw_builder bld = brw_builder(&s); |
| |
| /* Emit a URB write to end the thread. On Broadwell, we use this to write |
| * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy |
| * algorithm to set it optimally). On other platforms, we simply write |
| * zero to a reserved/MBZ patch header DWord which has no consequence. |
| */ |
| brw_reg srcs[URB_LOGICAL_NUM_SRCS]; |
| srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output; |
| srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16); |
| srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0); |
| srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1); |
| brw_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, |
| reg_undef, srcs, ARRAY_SIZE(srcs)); |
| inst->eot = true; |
| } |
| |
| static void |
| brw_assign_tcs_urb_setup(brw_shader &s) |
| { |
| assert(s.stage == MESA_SHADER_TESS_CTRL); |
| |
| /* Rewrite all ATTR file references to HW_REGs. */ |
| foreach_block_and_inst(block, brw_inst, inst, s.cfg) { |
| s.convert_attr_sources_to_hw_regs(inst); |
| } |
| } |
| |
| static bool |
| run_tcs(brw_shader &s) |
| { |
| assert(s.stage == MESA_SHADER_TESS_CTRL); |
| |
| struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(s.prog_data); |
| const brw_builder bld = brw_builder(&s); |
| |
| assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH || |
| vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH); |
| |
| s.payload_ = new brw_tcs_thread_payload(s); |
| |
| /* Initialize gl_InvocationID */ |
| brw_set_tcs_invocation_id(s); |
| |
| const bool fix_dispatch_mask = |
| vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH && |
| (s.nir->info.tess.tcs_vertices_out % 8) != 0; |
| |
| /* Fix the disptach mask */ |
| if (fix_dispatch_mask) { |
| bld.CMP(bld.null_reg_ud(), s.invocation_id, |
| brw_imm_ud(s.nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L); |
| bld.IF(BRW_PREDICATE_NORMAL); |
| } |
| |
| brw_from_nir(&s); |
| |
| if (fix_dispatch_mask) { |
| bld.emit(BRW_OPCODE_ENDIF); |
| } |
| |
| brw_emit_tcs_thread_end(s); |
| |
| if (s.failed) |
| return false; |
| |
| brw_calculate_cfg(s); |
| |
| brw_optimize(s); |
| |
| s.assign_curb_setup(); |
| brw_assign_tcs_urb_setup(s); |
| |
| brw_lower_3src_null_dest(s); |
| brw_workaround_emit_dummy_mov_instruction(s); |
| |
| brw_allocate_registers(s, true /* allow_spilling */); |
| |
| brw_workaround_source_arf_before_eot(s); |
| |
| return !s.failed; |
| } |
| |
| extern "C" const unsigned * |
| brw_compile_tcs(const struct brw_compiler *compiler, |
| struct brw_compile_tcs_params *params) |
| { |
| const struct intel_device_info *devinfo = compiler->devinfo; |
| nir_shader *nir = params->base.nir; |
| const struct brw_tcs_prog_key *key = params->key; |
| struct brw_tcs_prog_data *prog_data = params->prog_data; |
| struct brw_vue_prog_data *vue_prog_data = &prog_data->base; |
| const unsigned dispatch_width = brw_geometry_stage_dispatch_width(compiler->devinfo); |
| |
| const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS, params->base.source_hash); |
| |
| brw_prog_data_init(&prog_data->base.base, ¶ms->base); |
| |
| nir->info.outputs_written = key->outputs_written; |
| nir->info.patch_outputs_written = key->patch_outputs_written; |
| |
| struct intel_vue_map input_vue_map; |
| brw_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read, |
| key->base.vue_layout, 1); |
| brw_compute_tess_vue_map(&vue_prog_data->vue_map, |
| nir->info.outputs_written, |
| nir->info.patch_outputs_written); |
| |
| brw_nir_apply_key(nir, compiler, &key->base, dispatch_width); |
| brw_nir_lower_vue_inputs(nir, &input_vue_map); |
| brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map, |
| key->_tes_primitive_mode); |
| if (key->input_vertices > 0) |
| intel_nir_lower_patch_vertices_in(nir, key->input_vertices, NULL, NULL); |
| |
| brw_postprocess_nir(nir, compiler, debug_enabled, |
| key->base.robust_flags); |
| |
| bool has_primitive_id = |
| BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID); |
| |
| prog_data->input_vertices = key->input_vertices; |
| prog_data->patch_count_threshold = get_patch_count_threshold(key->input_vertices); |
| |
| if (compiler->use_tcs_multi_patch) { |
| vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_MULTI_PATCH; |
| prog_data->instances = nir->info.tess.tcs_vertices_out; |
| prog_data->include_primitive_id = has_primitive_id; |
| } else { |
| unsigned verts_per_thread = 8; |
| vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH; |
| prog_data->instances = |
| DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread); |
| } |
| |
| /* Compute URB entry size. The maximum allowed URB entry size is 32k. |
| * That divides up as follows: |
| * |
| * 32 bytes for the patch header (tessellation factors) |
| * 480 bytes for per-patch varyings (a varying component is 4 bytes and |
| * gl_MaxTessPatchComponents = 120) |
| * 16384 bytes for per-vertex varyings (a varying component is 4 bytes, |
| * gl_MaxPatchVertices = 32 and |
| * gl_MaxTessControlOutputComponents = 128) |
| * |
| * 15808 bytes left for varying packing overhead |
| */ |
| const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots; |
| const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots; |
| unsigned output_size_bytes = 0; |
| /* Note that the patch header is counted in num_per_patch_slots. */ |
| output_size_bytes += num_per_patch_slots * 16; |
| output_size_bytes += nir->info.tess.tcs_vertices_out * |
| num_per_vertex_slots * 16; |
| |
| assert(output_size_bytes >= 1); |
| if (output_size_bytes > GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES) |
| return NULL; |
| |
| /* URB entry sizes are stored as a multiple of 64 bytes. */ |
| vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64; |
| |
| /* HS does not use the usual payload pushing from URB to GRFs, |
| * because we don't have enough registers for a full-size payload, and |
| * the hardware is broken on Haswell anyway. |
| */ |
| vue_prog_data->urb_read_length = 0; |
| |
| if (unlikely(debug_enabled)) { |
| fprintf(stderr, "TCS Input "); |
| brw_print_vue_map(stderr, &input_vue_map, MESA_SHADER_TESS_CTRL); |
| fprintf(stderr, "TCS Output "); |
| brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL); |
| } |
| |
| brw_shader v(compiler, ¶ms->base, &key->base, |
| &prog_data->base.base, nir, dispatch_width, |
| params->base.stats != NULL, debug_enabled); |
| if (!run_tcs(v)) { |
| params->base.error_str = |
| ralloc_strdup(params->base.mem_ctx, v.fail_msg); |
| return NULL; |
| } |
| |
| assert(v.payload().num_regs % reg_unit(devinfo) == 0); |
| prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo); |
| prog_data->base.base.grf_used = v.grf_used; |
| |
| brw_generator g(compiler, ¶ms->base, |
| &prog_data->base.base, MESA_SHADER_TESS_CTRL); |
| if (unlikely(debug_enabled)) { |
| g.enable_debug(ralloc_asprintf(params->base.mem_ctx, |
| "%s tessellation control shader %s", |
| nir->info.label ? nir->info.label |
| : "unnamed", |
| nir->info.name)); |
| } |
| |
| g.generate_code(v.cfg, dispatch_width, v.shader_stats, |
| v.performance_analysis.require(), params->base.stats); |
| |
| g.add_const_data(nir->constant_data, nir->constant_data_size); |
| |
| return g.get_assembly(); |
| } |