blob: 0dfe8ab39b9d91b8b386de2ba85ea2324b3947ea [file] [log] [blame]
/*
* Copyright © 2021 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
*/
#include "ac_nir.h"
#include "nir_builder.h"
/*
* These NIR passes are used to lower NIR cross-stage I/O intrinsics into the
* memory accesses that actually happen on the HW.
*
* Each input and output has a 16-byte (4 dwords) slot reserved for it, and
* can have up to 4 components. Each component is 32 bits.
*
* ## VS-TCS-TES I/O - Terminology:
*
* * patch - Group of vertices, used instead of primitives in tessellation
* * per-vertex - input or output which can be different for every vertex.
* * per-patch - input or output which applies to a patch (a group of vertices)
*
* ## VS-TCS-TES I/O - How it works:
*
* ```
* SW model: SW VS SW TCS tessellator SW TES
* ┊ ┊ ┊ ┊
* ┌────┐ ┌────┐ ┌────┐ ┌─────┐
* HW pipeline: │ LS │─╮ ╭─>│ HS │─╮ ╭─>│ FF │ ╭─>│VS/ES│
* └────┘ │ │ └────┘ │ │ └────┘ │ └─────┘
* Memory: ╰─>LDS<──╯ ╰─>VRAM───────╯
* ```
*
* * SW VS runs as a HW LS (Local Shader, merged into HS on GFX9+),
* and SW TCS runs as HW HS (Hull Shader).
* SW TES runs as either HW VS or HW ES (Export Shader).
* * LS and HS share the same LDS space.
* * LS (SW VS) stores outputs to LDS to be read by HS (SW TCS).
* * HS (SW TCS) stores outputs in LDS if the HS (SW TCS) reads them.
* * HS (SW TCS) stores outputs in VRAM if the next stage (SW TES) reads them.
*
* Side note: some old HW supports having TES read from the same LDS space where LS/HS write, but
* Mesa always stores HS outputs to VRAM to avoid forcing TES waves to run on the same CU as the LS/HS waves.
*
* ### Passing VS-TCS I/O in registers
*
* On GPUs that run SW VS and SW TCS on the same HW stage (HS on GFX9+),
* IO can be passed through registers instead of LDS when the following conditions are met:
*
* 1. TCS input and output patch size match
* 2. Floating point execution modes in SW VS and SW TCS match
* 3. The SW VS output is not written indirectly, and the corresponding SW TCS input is not read indirectly
*
* Some HS outputs could be passed through registers too, but this is a TODO.
*
* ### LDS layout used by VS-TCS:
*
* ```
* TCS per-vertex inputs for patch 0 <─── 0
* TCS per-vertex inputs for patch 1
* TCS per-vertex inputs for patch 2 <─── hs_per_vertex_input_lds_offset (rel_patch_id = 2)
* ...
* TCS per-vertex outputs for patch 0 <─── output_patch0_offset
* TCS per-patch outputs for patch 0 <─── output_patch0_patch_data_offset
* TCS per-vertex outputs for patch 1
* TCS per-patch outputs for patch 1
* TCS per-vertex outputs for patch 2 <─── hs_output_lds_offset (rel_patch_id = 2, per-vertex)
* TCS per-patch outputs for patch 2 <─── hs_output_lds_offset (rel_patch_id = 2, per-patch)
* ...
* ```
*
* ### VRAM layout used by TCS-TES I/O:
*
* ```
* attr 0 of patch 0 vertex 0 <─── "off-chip LDS" offset
* attr 0 of patch 0 vertex 1
* attr 0 of patch 0 vertex 2
* ...
* attr 0 of patch 1 vertex 0
* attr 0 of patch 1 vertex 1
* attr 0 of patch 1 vertex 2 <─── hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 1)
* ...
* attr 0 of patch 2 vertex 0
* attr 0 of patch 2 vertex 1
* attr 0 of patch 2 vertex 2
* ...
* attr 1 of patch 0 vertex 0
* attr 1 of patch 0 vertex 1
* attr 1 of patch 0 vertex 2
* ...
* ...
* per-patch attr 0 of patch 0 <─── hs_out_patch_data_offset_amd
* per-patch attr 0 of patch 1
* per-patch attr 0 of patch 2 <─── hs_per_patch_output_vmem_offset (attribute slot = 0, rel_patch_id = 2)
* ...
* per-patch attr 1 of patch 0
* per-patch attr 1 of patch 1
* per-patch attr 1 of patch 2
* ...
* ```
*
*/
/* State shared by the lowering callbacks in this file.
 * Each public entry point initializes only a subset of these fields with
 * designated initializers; the remaining fields start out as zero.
 */
typedef struct {
/* Which hardware generation we're dealing with */
enum amd_gfx_level gfx_level;
/* I/O semantic -> real location used by lowering.
 * Passed through to ac_nir_calc_io_offset().
 */
ac_nir_map_io_driver_location map_io;
/* True if the merged VS+TCS (on GFX9+) has the same
 * input and output patch size.
 */
bool tcs_in_out_eq;
/* Bit mask of TCS per-vertex inputs (VS outputs) which
 * are passed between the two stages only in temporaries (registers).
 */
uint64_t tcs_temp_only_inputs;
/* Bit masks of TCS outputs read by TES: tes_inputs_read is consulted for
 * per-vertex output stores, tes_patch_inputs_read for all other output stores.
 */
uint64_t tes_inputs_read;
uint64_t tes_patch_inputs_read;
/* Whether TES reads the tess factors. */
bool tes_reads_tessfactors;
/* Number of 16-byte LDS slots reserved per TCS output vertex, and
 * per patch for per-patch outputs, respectively.
 */
unsigned tcs_num_reserved_outputs;
unsigned tcs_num_reserved_patch_outputs;
/* Where the inner/outer tessellation levels are stored, as byte offsets
 * (intrinsic base * 16). Recorded when a tess level store is lowered to LDS.
 */
unsigned tcs_tess_lvl_in_loc;
unsigned tcs_tess_lvl_out_loc;
/* True if the output patch fits the subgroup, so all TCS outputs are always written in the same
 * subgroup that reads them.
 */
bool tcs_out_patch_fits_subgroup;
/* Set if all invocations will write to all tess factors, so tess factors
 * can be passed by register.
 */
bool tcs_pass_tessfactors_by_reg;
/* Whether all TCS inputs are accessed using gl_InvocationID and passed via VGPRs.
 * In that case, no LDS is allocated for TCS inputs.
 */
bool tcs_no_inputs_in_lds;
} lower_tess_io_state;
/* Tests whether the I/O slot accessed by an intrinsic is set in a bit mask.
 *
 * stage:          shader stage the intrinsic belongs to
 * intrin:         the I/O intrinsic to examine
 * mask:           bit mask of slots to test against
 * match_indirect: value returned when the I/O offset is not a constant
 *
 * NOTE(review): for TCS intrinsics that are not per-vertex, a location below
 * VARYING_SLOT_PATCH0 makes the unsigned subtraction wrap around, which
 * yields an out-of-range (undefined) shift count. Confirm that callers never
 * reach this with such a location.
 */
static bool
match_mask(gl_shader_stage stage,
           nir_intrinsic_instr *intrin,
           uint64_t mask,
           bool match_indirect)
{
   /* With a non-constant offset the accessed slot is unknown here,
    * so the caller-chosen answer is used.
    */
   if (!nir_src_is_const(*nir_get_io_offset_src(intrin)))
      return match_indirect;

   const bool is_per_vertex =
      intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
      intrin->intrinsic == nir_intrinsic_store_per_vertex_output;

   uint64_t bit_index = nir_intrinsic_io_semantics(intrin).location;

   /* In TCS, everything except per-vertex input loads and per-vertex
    * output stores is rebased to VARYING_SLOT_PATCH0.
    */
   if (stage == MESA_SHADER_TESS_CTRL && !is_per_vertex)
      bit_index -= VARYING_SLOT_PATCH0;

   return ((UINT64_C(1) << bit_index) & mask) != 0;
}
/* Returns whether a TCS output store matches the mask of outputs read by TES.
 * Per-vertex stores are tested against tes_inputs_read, all other stores
 * against tes_patch_inputs_read. Indirectly addressed stores always match.
 */
static bool
tcs_output_needs_vmem(nir_intrinsic_instr *intrin,
                      lower_tess_io_state *st)
{
   uint64_t tes_read_mask;

   if (intrin->intrinsic == nir_intrinsic_store_per_vertex_output)
      tes_read_mask = st->tes_inputs_read;
   else
      tes_read_mask = st->tes_patch_inputs_read;

   return match_mask(MESA_SHADER_TESS_CTRL, intrin, tes_read_mask, true);
}
/* Returns whether a TCS output store matches the mask of outputs that the
 * TCS itself reads. Per-vertex stores are tested against info.outputs_read,
 * all other stores against info.patch_outputs_read. Indirectly addressed
 * stores always match.
 */
static bool
tcs_output_needs_lds(nir_intrinsic_instr *intrin,
                     nir_shader *shader)
{
   uint64_t tcs_read_mask;

   if (intrin->intrinsic == nir_intrinsic_store_per_vertex_output)
      tcs_read_mask = shader->info.outputs_read;
   else
      tcs_read_mask = shader->info.patch_outputs_read;

   return match_mask(MESA_SHADER_TESS_CTRL, intrin, tcs_read_mask, true);
}
/* Instruction callback for nir_shader_instructions_pass(): lowers a
 * store_output in the VS-as-LS to a shared memory store.
 *
 * b:     builder used to emit the replacement code
 * instr: instruction being visited
 * state: pointer to the lower_tess_io_state of this pass
 *
 * Returns true when the shader was changed.
 */
static bool
lower_ls_output_store(nir_builder *b,
                      nir_instr *instr,
                      void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *store = nir_instr_as_intrinsic(instr);
   if (store->intrinsic != nir_intrinsic_store_output)
      return false;

   /* The ARB_shader_viewport_layer_array spec contains the
    * following issue:
    *
    *    2) What happens if gl_ViewportIndex or gl_Layer is
    *    written in the vertex shader and a geometry shader is
    *    present?
    *
    *    RESOLVED: The value written by the last vertex processing
    *    stage is used. If the last vertex processing stage
    *    (vertex, tessellation evaluation or geometry) does not
    *    statically assign to gl_ViewportIndex or gl_Layer, index
    *    or layer zero is assumed.
    *
    * So writes to those outputs in VS-as-LS are simply dropped.
    */
   switch (nir_intrinsic_io_semantics(store).location) {
   case VARYING_SLOT_LAYER:
   case VARYING_SLOT_VIEWPORT:
      nir_instr_remove(instr);
      return true;
   default:
      break;
   }

   lower_tess_io_state *st = state;

   /* Temp-only TCS inputs never go through shared memory; leave the store alone. */
   if (match_mask(MESA_SHADER_VERTEX, store, st->tcs_temp_only_inputs, false))
      return false;

   b->cursor = nir_before_instr(instr);

   /* Address = local invocation index * LS-HS vertex stride + offset of the slot. */
   nir_ssa_def *invocation = nir_load_local_invocation_index(b);
   nir_ssa_def *vertex_stride = nir_load_lshs_vertex_stride_amd(b);
   nir_ssa_def *vertex_base = nir_imul(b, invocation, vertex_stride);
   nir_ssa_def *slot_size = nir_imm_int(b, 16u);
   nir_ssa_def *slot_off = ac_nir_calc_io_offset(b, store, slot_size, 4u, st->map_io);
   const unsigned wrmask = nir_intrinsic_write_mask(store);
   nir_ssa_def *lds_addr = nir_iadd_nuw(b, vertex_base, slot_off);

   nir_store_shared(b, store->src[0].ssa, lds_addr, .write_mask = wrmask,
                    .align_mul = 16u,
                    .align_offset = (nir_intrinsic_component(store) * 4u) % 16u);

   /* NOTE: on GFX9+ with tcs_in_out_eq the store_output intrinsic must stay,
    * because same-invocation TCS input loads will use it.
    */
   if (!st->tcs_in_out_eq)
      nir_instr_remove(instr);

   return true;
}
static bool
filter_load_tcs_per_vertex_input(const nir_instr *instr,
UNUSED const void *state)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
lower_tess_io_state *st = (lower_tess_io_state *) state;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_per_vertex_input)
return false;
if (!st->tcs_in_out_eq)
return true;
/* tcs_in_out_eq: a same-invocation input load, without indirect offset,
* can use temporaries, no need to use shared memory.
*/
nir_src *off_src = nir_get_io_offset_src(intrin);
nir_src *vertex_index_src = nir_get_io_arrayed_index_src(intrin);
nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
bool can_use_temps = nir_src_is_const(*off_src) &&
vertex_index_instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
return !can_use_temps;
}
/* Emits the LDS offset of a TCS per-vertex input:
 *
 *    rel_patch_id * (patch_vertices_in * vertex_stride)
 *    + vertex_index * vertex_stride
 *    + offset of the accessed slot (16 bytes per slot, 4 per component)
 *
 * b:     builder positioned where the code should be emitted
 * st:    lowering state, provides map_io
 * instr: the per-vertex input intrinsic being lowered
 */
static nir_ssa_def *
hs_per_vertex_input_lds_offset(nir_builder *b,
                               lower_tess_io_state *st,
                               nir_intrinsic_instr *instr)
{
   nir_ssa_def *in_vertices = nir_load_patch_vertices_in(b);
   nir_ssa_def *rel_patch = nir_load_tess_rel_patch_id_amd(b);
   nir_ssa_def *vtx = nir_get_io_arrayed_index_src(instr)->ssa;
   nir_ssa_def *vtx_stride = nir_load_lshs_vertex_stride_amd(b);

   nir_ssa_def *patch_stride = nir_imul(b, in_vertices, vtx_stride);
   nir_ssa_def *vtx_off = nir_imul(b, vtx, vtx_stride);
   nir_ssa_def *patch_off = nir_imul(b, rel_patch, patch_stride);

   nir_ssa_def *slot_size = nir_imm_int(b, 16u);
   nir_ssa_def *slot_off = ac_nir_calc_io_offset(b, instr, slot_size, 4u, st->map_io);

   nir_ssa_def *vertex_base = nir_iadd_nuw(b, patch_off, vtx_off);
   return nir_iadd_nuw(b, vertex_base, slot_off);
}
/* Emits the LDS offset of a TCS output.
 *
 * b:      builder positioned where the code should be emitted
 * st:     lowering state
 * intrin: the output intrinsic being lowered, or NULL to get the start of
 *         the per-patch outputs of the current patch
 *
 * Per-vertex accesses address the slot inside the indexed output vertex;
 * everything else is placed after all per-vertex outputs of the patch.
 */
static nir_ssa_def *
hs_output_lds_offset(nir_builder *b,
                     lower_tess_io_state *st,
                     nir_intrinsic_instr *intrin)
{
   const bool per_vertex =
      intrin != NULL &&
      (intrin->intrinsic == nir_intrinsic_store_per_vertex_output ||
       intrin->intrinsic == nir_intrinsic_load_per_vertex_output);

   /* Sizes in bytes: every reserved slot takes 16 bytes. */
   const unsigned vertex_bytes = st->tcs_num_reserved_outputs * 16u;
   const unsigned all_vertices_bytes = b->shader->info.tess.tcs_vertices_out * vertex_bytes;
   const unsigned patch_bytes = all_vertices_bytes + st->tcs_num_reserved_patch_outputs * 16u;

   nir_ssa_def *off;
   if (intrin)
      off = ac_nir_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u, st->map_io);
   else
      off = nir_imm_int(b, 0);

   nir_ssa_def *rel_patch = nir_load_tess_rel_patch_id_amd(b);
   nir_ssa_def *cur_patch_off = nir_imul_imm(b, rel_patch, patch_bytes);

   /* Unless no inputs live in LDS, the outputs start after the inputs of all patches. */
   nir_ssa_def *patch_base = cur_patch_off;
   if (!st->tcs_no_inputs_in_lds) {
      nir_ssa_def *in_vertices = nir_load_patch_vertices_in(b);
      nir_ssa_def *num_patches = nir_load_tcs_num_patches_amd(b);
      nir_ssa_def *in_patch_bytes =
         nir_imul(b, in_vertices, nir_load_lshs_vertex_stride_amd(b));
      nir_ssa_def *all_inputs_bytes = nir_imul(b, in_patch_bytes, num_patches);
      patch_base = nir_iadd_nuw(b, cur_patch_off, all_inputs_bytes);
   }

   if (per_vertex) {
      nir_ssa_def *vtx = nir_ssa_for_src(b, *nir_get_io_arrayed_index_src(intrin), 1);
      nir_ssa_def *vtx_off = nir_imul_imm(b, vtx, vertex_bytes);
      off = nir_iadd_nuw(b, off, vtx_off);
   } else {
      off = nir_iadd_imm_nuw(b, off, all_vertices_bytes);
   }

   return nir_iadd_nuw(b, off, patch_base);
}
/* Emits the offset of a per-vertex TCS output (TES per-vertex input):
 *
 *    slot offset, with an attribute stride of num_patches * vertices * 16
 *    + rel_patch_id * vertices * 16
 *    + vertex_index * 16
 *
 * The number of vertices per patch is the compile-time tcs_vertices_out when
 * the current shader is a TCS, and load_patch_vertices_in otherwise.
 */
static nir_ssa_def *
hs_per_vertex_output_vmem_offset(nir_builder *b,
                                 lower_tess_io_state *st,
                                 nir_intrinsic_instr *intrin)
{
   nir_ssa_def *vertices;
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      vertices = nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      vertices = nir_load_patch_vertices_in(b);

   nir_ssa_def *num_patches = nir_load_tcs_num_patches_amd(b);
   nir_ssa_def *attr_stride = nir_imul(b, num_patches, nir_imul_imm(b, vertices, 16u));
   nir_ssa_def *slot_off = ac_nir_calc_io_offset(b, intrin, attr_stride, 4u, st->map_io);

   nir_ssa_def *rel_patch = nir_load_tess_rel_patch_id_amd(b);
   nir_ssa_def *patch_off = nir_imul(b, rel_patch, nir_imul_imm(b, vertices, 16u));

   nir_ssa_def *vtx = nir_ssa_for_src(b, *nir_get_io_arrayed_index_src(intrin), 1);
   nir_ssa_def *vtx_off = nir_imul_imm(b, vtx, 16u);

   nir_ssa_def *vertex_base = nir_iadd_nuw(b, patch_off, vtx_off);
   return nir_iadd_nuw(b, vertex_base, slot_off);
}
/* Emits the offset of a per-patch TCS output (TES per-patch input).
 *
 * b:                 builder positioned where the code should be emitted
 * st:                lowering state, provides map_io
 * intrin:            the intrinsic being lowered, or NULL when only
 *                    const_base_offset selects the slot
 * const_base_offset: extra constant offset, scaled by the number of patches
 *
 * Result: slot offset (attribute stride = num_patches * 16)
 *         + num_patches * const_base_offset (if non-zero)
 *         + hs_out_patch_data_offset + rel_patch_id * 16
 */
static nir_ssa_def *
hs_per_patch_output_vmem_offset(nir_builder *b,
                                lower_tess_io_state *st,
                                nir_intrinsic_instr *intrin,
                                unsigned const_base_offset)
{
   nir_ssa_def *num_patches = nir_load_tcs_num_patches_amd(b);
   nir_ssa_def *patch_data_start = nir_load_hs_out_patch_data_offset_amd(b);

   nir_ssa_def *result;
   if (intrin) {
      nir_ssa_def *attr_stride = nir_imul_imm(b, num_patches, 16u);
      result = ac_nir_calc_io_offset(b, intrin, attr_stride, 4u, st->map_io);
   } else {
      result = nir_imm_int(b, 0);
   }

   if (const_base_offset) {
      nir_ssa_def *scaled_base = nir_imul_imm(b, num_patches, const_base_offset);
      result = nir_iadd_nuw(b, result, scaled_base);
   }

   nir_ssa_def *rel_patch = nir_load_tess_rel_patch_id_amd(b);
   nir_ssa_def *patch_off = nir_imul_imm(b, rel_patch, 16u);

   result = nir_iadd_nuw(b, result, patch_data_start);
   result = nir_iadd_nuw(b, result, patch_off);
   return result;
}
/* Lowering callback for nir_shader_lower_instructions(): replaces a TCS
 * per-vertex input load with a shared memory load of the same size.
 *
 * Returns the SSA value that replaces the original load.
 */
static nir_ssa_def *
lower_hs_per_vertex_input_load(nir_builder *b,
                               nir_instr *instr,
                               void *state)
{
   lower_tess_io_state *st = state;
   nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);

   nir_ssa_def *lds_addr = hs_per_vertex_input_lds_offset(b, st, load);
   const unsigned component_align = (nir_intrinsic_component(load) * 4u) % 16u;

   return nir_load_shared(b, load->dest.ssa.num_components, load->dest.ssa.bit_size, lds_addr,
                          .align_mul = 16u, .align_offset = component_align);
}
/* Lowers a TCS output store to a buffer store and/or a shared memory store.
 *
 * - Outputs other than tess factors that match the TES read masks are
 *   stored to the off-chip ring.
 * - Tess factors not passed by register, and outputs matching the TCS's own
 *   read masks, are stored to LDS.
 *
 * Returns NIR_LOWER_INSTR_PROGRESS (original store kept) for tess factors
 * passed by register, NIR_LOWER_INSTR_PROGRESS_REPLACE otherwise.
 */
static nir_ssa_def *
lower_hs_output_store(nir_builder *b,
                      nir_intrinsic_instr *intrin,
                      lower_tess_io_state *st)
{
   assert(intrin->intrinsic == nir_intrinsic_store_per_vertex_output ||
          intrin->intrinsic == nir_intrinsic_store_output);

   const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
   nir_ssa_def *value = intrin->src[0].ssa;
   const unsigned wrmask = nir_intrinsic_write_mask(intrin);
   const bool is_tess_factor = sem.location == VARYING_SLOT_TESS_LEVEL_INNER ||
                               sem.location == VARYING_SLOT_TESS_LEVEL_OUTER;
   const bool tess_factor_by_reg = is_tess_factor && st->tcs_pass_tessfactors_by_reg;

   /* Tess factors are never tested against the TES read masks here. */
   bool to_vmem = false;
   if (!is_tess_factor)
      to_vmem = tcs_output_needs_vmem(intrin, st);

   bool to_lds = true;
   if (!is_tess_factor || st->tcs_pass_tessfactors_by_reg)
      to_lds = tcs_output_needs_lds(intrin, b->shader);

   if (to_vmem) {
      nir_ssa_def *vmem_off;
      if (intrin->intrinsic == nir_intrinsic_store_per_vertex_output)
         vmem_off = hs_per_vertex_output_vmem_offset(b, st, intrin);
      else
         vmem_off = hs_per_patch_output_vmem_offset(b, st, intrin, 0);

      nir_ssa_def *offchip_ring = nir_load_ring_tess_offchip_amd(b);
      nir_ssa_def *offchip_base = nir_load_ring_tess_offchip_offset_amd(b);
      nir_ssa_def *zero = nir_imm_int(b, 0);

      nir_store_buffer_amd(b, value, offchip_ring, vmem_off, offchip_base, zero,
                           .write_mask = wrmask, .memory_modes = nir_var_shader_out,
                           .access = ACCESS_COHERENT);
   }

   if (to_lds) {
      /* Record where the tess factors live, they are read back later. */
      switch (sem.location) {
      case VARYING_SLOT_TESS_LEVEL_INNER:
         st->tcs_tess_lvl_in_loc = nir_intrinsic_base(intrin) * 16u;
         break;
      case VARYING_SLOT_TESS_LEVEL_OUTER:
         st->tcs_tess_lvl_out_loc = nir_intrinsic_base(intrin) * 16u;
         break;
      default:
         break;
      }

      nir_ssa_def *lds_addr = hs_output_lds_offset(b, st, intrin);
      nir_store_shared(b, value, lds_addr, .write_mask = wrmask,
                       .align_mul = 16u,
                       .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
   }

   /* A tess factor store_output that is passed by register instead of LDS
    * must survive: the radeonsi LLVM backend uses it to create the LLVM
    * variable that the final tess factor write epilog reads.
    */
   if (tess_factor_by_reg)
      return NIR_LOWER_INSTR_PROGRESS;

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
/* Lowers a TCS output load to a shared memory load of the same size.
 * Returns the SSA value that replaces the original load.
 */
static nir_ssa_def *
lower_hs_output_load(nir_builder *b,
                     nir_intrinsic_instr *intrin,
                     lower_tess_io_state *st)
{
   nir_ssa_def *lds_addr = hs_output_lds_offset(b, st, intrin);
   const unsigned num_comps = intrin->dest.ssa.num_components;
   const unsigned bits = intrin->dest.ssa.bit_size;

   return nir_load_shared(b, num_comps, bits, lds_addr,
                          .align_mul = 16u,
                          .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
}
/* Adjusts a scoped barrier in the TCS after output accesses have been
 * lowered to shared memory accesses:
 * - barriers covering shader outputs also cover shared memory;
 * - workgroup execution/memory scopes shrink to subgroup scope when the
 *   output patch fits the subgroup.
 */
static void
update_hs_scoped_barrier(nir_intrinsic_instr *intrin, lower_tess_io_state *st)
{
   const unsigned old_modes = nir_intrinsic_memory_modes(intrin);
   const unsigned new_modes =
      (old_modes & nir_var_shader_out) ? (old_modes | nir_var_mem_shared) : old_modes;
   nir_intrinsic_set_memory_modes(intrin, new_modes);

   const bool fits_subgroup = st->tcs_out_patch_fits_subgroup;

   if (nir_intrinsic_execution_scope(intrin) == NIR_SCOPE_WORKGROUP && fits_subgroup)
      nir_intrinsic_set_execution_scope(intrin, NIR_SCOPE_SUBGROUP);

   if (nir_intrinsic_memory_scope(intrin) == NIR_SCOPE_WORKGROUP && fits_subgroup)
      nir_intrinsic_set_memory_scope(intrin, NIR_SCOPE_SUBGROUP);
}
/* Lowering callback for nir_shader_lower_instructions(): dispatches TCS
 * output stores, output loads and scoped barriers to their handlers.
 * Any other intrinsic is a programming error.
 */
static nir_ssa_def *
lower_hs_output_access(nir_builder *b,
                       nir_instr *instr,
                       void *state)
{
   lower_tess_io_state *st = state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      return lower_hs_output_store(b, intrin, st);
   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
      return lower_hs_output_load(b, intrin, st);
   case nir_intrinsic_scoped_barrier:
      /* Barriers are modified in place, not replaced. */
      update_hs_scoped_barrier(intrin, st);
      return NIR_LOWER_INSTR_PROGRESS;
   default:
      unreachable("intrinsic not supported by lower_hs_output_access");
   }
}
/* Appends code to the end of the TCS that, in invocation 0 of every patch,
 * reads the tess factors back from LDS and stores them to the tess factor
 * ring, and also to the off-chip ring when TES reads them.
 *
 * shader: the TCS to modify
 * st:     lowering state; tcs_tess_lvl_{in,out}_loc give the LDS offsets of
 *         the tess factors
 */
static void
hs_emit_write_tess_factors(nir_shader *shader,
                           lower_tess_io_state *st)
{
   unsigned num_outer;
   unsigned num_inner;

   /* Number of outer/inner tess factor components for each primitive mode. */
   switch (shader->info.tess._primitive_mode) {
   case TESS_PRIMITIVE_ISOLINES:
      num_outer = 2;
      num_inner = 0;
      break;
   case TESS_PRIMITIVE_TRIANGLES:
      num_outer = 3;
      num_inner = 1;
      break;
   case TESS_PRIMITIVE_QUADS:
      num_outer = 4;
      num_inner = 2;
      break;
   default:
      unreachable("invalid primitive mode");
      return;
   }

   nir_function_impl *entry = nir_shader_get_entrypoint(shader);
   assert(entry);
   nir_block *end_block = nir_impl_last_block(entry);
   assert(end_block);

   /* We assume there is always a single end block in the shader. */
   nir_builder build;
   nir_builder *b = &build;
   nir_builder_init(b, entry);
   b->cursor = nir_after_block(end_block);

   /* Barrier on shared memory before the tess factors are read back. */
   const nir_scope barrier_scope =
      st->tcs_out_patch_fits_subgroup ? NIR_SCOPE_SUBGROUP : NIR_SCOPE_WORKGROUP;
   nir_scoped_barrier(b, .execution_scope = barrier_scope, .memory_scope = barrier_scope,
                      .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_shared);

   nir_ssa_def *invoc_id = nir_load_invocation_id(b);

   /* Only the 1st invocation of each patch needs to do this. */
   nir_if *if_first_invoc = nir_push_if(b, nir_ieq_imm(b, invoc_id, 0));

   /* When the output patch size is <= 32 then we can flatten the branch here
    * because we know for sure that at least 1 invocation in all waves will
    * take the branch.
    */
   if (shader->info.tess.tcs_vertices_out <= 32)
      if_first_invoc->control = nir_selection_control_divergent_always_taken;

   /* The descriptor where tess factors have to be stored by the shader. */
   nir_ssa_def *tf_ring = nir_load_ring_tess_factors_amd(b);

   /* Base LDS address of per-patch outputs in the current patch. */
   nir_ssa_def *patch_data_lds = hs_output_lds_offset(b, st, NULL);

   /* Load all tessellation factors (aka. tess levels) from LDS. */
   nir_ssa_def *outer = nir_load_shared(b, num_outer, 32, patch_data_lds,
                                        .base = st->tcs_tess_lvl_out_loc,
                                        .align_mul = 16u,
                                        .align_offset = st->tcs_tess_lvl_out_loc % 16u);
   nir_ssa_def *inner = NULL;
   if (num_inner) {
      inner = nir_load_shared(b, num_inner, 32, patch_data_lds,
                              .base = st->tcs_tess_lvl_in_loc,
                              .align_mul = 16u,
                              .align_offset = st->tcs_tess_lvl_in_loc % 16u);
   }

   nir_ssa_def *zero = nir_imm_int(b, 0);
   nir_ssa_def *rel_patch = nir_load_tess_rel_patch_id_amd(b);
   nir_ssa_def *tf_ring_base = nir_load_ring_tess_factors_offset_amd(b);
   nir_ssa_def *tf_patch_off = nir_imul_imm(b, rel_patch, (num_inner + num_outer) * 4u);
   unsigned tf_const_off = 0;

   if (st->gfx_level <= GFX8) {
      /* Store the dynamic HS control word; the tess factors follow it. */
      nir_if *if_first_patch = nir_push_if(b, nir_ieq_imm(b, rel_patch, 0));
      nir_ssa_def *control_word = nir_imm_int(b, 0x80000000u);
      nir_store_buffer_amd(b, control_word, tf_ring, zero, tf_ring_base, zero,
                           .access = ACCESS_COHERENT);
      tf_const_off += 4;
      nir_pop_if(b, if_first_patch);
   }

   /* Store tess factors for the tessellator */
   switch (shader->info.tess._primitive_mode) {
   case TESS_PRIMITIVE_ISOLINES: {
      /* LINES reversal */
      nir_ssa_def *swapped = nir_vec2(b, nir_channel(b, outer, 1), nir_channel(b, outer, 0));
      nir_store_buffer_amd(b, swapped, tf_ring, tf_patch_off, tf_ring_base, zero,
                           .base = tf_const_off, .access = ACCESS_COHERENT);
      break;
   }
   case TESS_PRIMITIVE_TRIANGLES: {
      /* Three outer factors and the single inner factor go out as one vec4. */
      nir_ssa_def *packed = nir_vec4(b, nir_channel(b, outer, 0), nir_channel(b, outer, 1),
                                     nir_channel(b, outer, 2), nir_channel(b, inner, 0));
      nir_store_buffer_amd(b, packed, tf_ring, tf_patch_off, tf_ring_base, zero,
                           .base = tf_const_off, .access = ACCESS_COHERENT);
      break;
   }
   default:
      /* Outer factors first, inner factors right behind them. */
      nir_store_buffer_amd(b, outer, tf_ring, tf_patch_off, tf_ring_base, zero,
                           .base = tf_const_off, .access = ACCESS_COHERENT);
      nir_store_buffer_amd(b, inner, tf_ring, tf_patch_off, tf_ring_base, zero,
                           .base = tf_const_off + 4u * num_outer, .access = ACCESS_COHERENT);
      break;
   }

   if (st->tes_reads_tessfactors) {
      /* Store to offchip for TES to read - only if TES actually reads them */
      nir_ssa_def *offchip_ring = nir_load_ring_tess_offchip_amd(b);
      nir_ssa_def *offchip_base = nir_load_ring_tess_offchip_offset_amd(b);

      nir_ssa_def *outer_vmem_off =
         hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_out_loc);
      nir_store_buffer_amd(b, outer, offchip_ring, outer_vmem_off, offchip_base, zero,
                           .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);

      if (num_inner) {
         nir_ssa_def *inner_vmem_off =
            hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_in_loc);
         nir_store_buffer_amd(b, inner, offchip_ring, inner_vmem_off, offchip_base, zero,
                              .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
      }
   }

   nir_pop_if(b, if_first_invoc);

   /* New control flow was added, so no metadata is preserved. */
   nir_metadata_preserve(entry, nir_metadata_none);
}
/* Lowering callback for nir_shader_lower_instructions(): replaces a TES
 * input load with a buffer load from the off-chip ring. Per-vertex loads
 * and other input loads use their respective offset calculations.
 *
 * Returns the SSA value that replaces the original load.
 */
static nir_ssa_def *
lower_tes_input_load(nir_builder *b,
                     nir_instr *instr,
                     void *state)
{
   lower_tess_io_state *st = state;
   nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);

   nir_ssa_def *ring = nir_load_ring_tess_offchip_amd(b);
   nir_ssa_def *ring_base = nir_load_ring_tess_offchip_offset_amd(b);

   nir_ssa_def *vmem_off;
   if (load->intrinsic == nir_intrinsic_load_per_vertex_input)
      vmem_off = hs_per_vertex_output_vmem_offset(b, st, load);
   else
      vmem_off = hs_per_patch_output_vmem_offset(b, st, load, 0);

   nir_ssa_def *zero = nir_imm_int(b, 0);

   return nir_load_buffer_amd(b, load->dest.ssa.num_components,
                              load->dest.ssa.bit_size, ring,
                              vmem_off, ring_base, zero,
                              .access = ACCESS_COHERENT);
}
/* Filter callback for nir_shader_lower_instructions(): selects output
 * stores, output loads (per-vertex or not) and scoped barriers.
 * The state pointer is not used.
 */
static bool
filter_hs_output_access(const nir_instr *instr,
                        UNUSED const void *st)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   switch (nir_instr_as_intrinsic(instr)->intrinsic) {
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
   case nir_intrinsic_scoped_barrier:
      return true;
   default:
      return false;
   }
}
/* Filter callback for nir_shader_lower_instructions(): selects input loads,
 * both per-vertex and not. The state pointer is not used.
 */
static bool
filter_any_input_access(const nir_instr *instr,
                        UNUSED const void *st)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   switch (nir_instr_as_intrinsic(instr)->intrinsic) {
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      return true;
   default:
      return false;
   }
}
/* Lowers the output stores of a vertex shader to shared memory stores.
 *
 * shader:               the vertex shader (asserted)
 * map:                  I/O semantic -> driver location mapping
 * tcs_in_out_eq:        whether the TCS input and output patch sizes match
 * tcs_temp_only_inputs: bit mask of outputs passed only in temporaries;
 *                       ignored unless tcs_in_out_eq is set
 */
void
ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map,
                               bool tcs_in_out_eq,
                               uint64_t tcs_temp_only_inputs)
{
   assert(shader->info.stage == MESA_SHADER_VERTEX);

   /* Without matching patch sizes nothing can be passed in temporaries. */
   const uint64_t temp_only_mask = tcs_in_out_eq ? tcs_temp_only_inputs : 0;

   lower_tess_io_state state = {
      .map_io = map,
      .tcs_in_out_eq = tcs_in_out_eq,
      .tcs_temp_only_inputs = temp_only_mask,
   };

   nir_shader_instructions_pass(shader,
                                lower_ls_output_store,
                                nir_metadata_block_index | nir_metadata_dominance,
                                &state);
}
/* Lowers the per-vertex input loads of a TCS to shared memory loads.
 *
 * shader:        the tessellation control shader (asserted)
 * map:           I/O semantic -> driver location mapping
 * tcs_in_out_eq: whether the TCS input and output patch sizes match; if so,
 *                same-invocation loads with constant offset are left alone
 */
void
ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
                              ac_nir_map_io_driver_location map,
                              bool tcs_in_out_eq)
{
   assert(shader->info.stage == MESA_SHADER_TESS_CTRL);

   lower_tess_io_state state = {
      .map_io = map,
      .tcs_in_out_eq = tcs_in_out_eq,
   };

   nir_shader_lower_instructions(shader,
                                 filter_load_tcs_per_vertex_input,
                                 lower_hs_per_vertex_input_load,
                                 &state);
}
/* Lowers the output accesses of a TCS to shared memory and buffer accesses,
 * and optionally appends the code that writes the tess factors.
 *
 * shader:                         the tessellation control shader (asserted)
 * map:                            I/O semantic -> driver location mapping
 * gfx_level:                      hardware generation
 * tes_reads_tessfactors:          whether TES reads the tess factors
 * tes_inputs_read:                mask of per-vertex outputs read by TES
 * tes_patch_inputs_read:          mask of per-patch outputs read by TES
 * num_reserved_tcs_outputs:       16-byte LDS slots per output vertex
 * num_reserved_tcs_patch_outputs: 16-byte LDS slots per patch
 * wave_size:                      subgroup size, used to tell whether the
 *                                 output patch fits the subgroup
 * no_inputs_in_lds:               whether no TCS inputs are stored in LDS
 * pass_tessfactors_by_reg:        whether tess factors are passed by register
 * emit_tess_factor_write:         whether to append the tess factor write
 */
void
ac_nir_lower_hs_outputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map,
                               enum amd_gfx_level gfx_level,
                               bool tes_reads_tessfactors,
                               uint64_t tes_inputs_read,
                               uint64_t tes_patch_inputs_read,
                               unsigned num_reserved_tcs_outputs,
                               unsigned num_reserved_tcs_patch_outputs,
                               unsigned wave_size,
                               bool no_inputs_in_lds,
                               bool pass_tessfactors_by_reg,
                               bool emit_tess_factor_write)
{
   assert(shader->info.stage == MESA_SHADER_TESS_CTRL);

   /* A modulo by zero is undefined behavior: flag a zero output patch size
    * in debug builds and treat it as "does not fit" otherwise.
    */
   const unsigned vertices_out = shader->info.tess.tcs_vertices_out;
   assert(vertices_out != 0);
   const bool patch_fits_subgroup = vertices_out != 0 && wave_size % vertices_out == 0;

   lower_tess_io_state state = {
      .gfx_level = gfx_level,
      .tes_reads_tessfactors = tes_reads_tessfactors,
      .tes_inputs_read = tes_inputs_read,
      .tes_patch_inputs_read = tes_patch_inputs_read,
      .tcs_num_reserved_outputs = num_reserved_tcs_outputs,
      .tcs_num_reserved_patch_outputs = num_reserved_tcs_patch_outputs,
      .tcs_out_patch_fits_subgroup = patch_fits_subgroup,
      .tcs_pass_tessfactors_by_reg = pass_tessfactors_by_reg,
      .tcs_no_inputs_in_lds = no_inputs_in_lds,
      .map_io = map,
   };

   nir_shader_lower_instructions(shader,
                                 filter_hs_output_access,
                                 lower_hs_output_access,
                                 &state);

   /* Must run after the lowering above, which records the tess factor locations. */
   if (emit_tess_factor_write)
      hs_emit_write_tess_factors(shader, &state);
}
/* Lowers all input loads of a TES to buffer loads from the off-chip ring.
 *
 * shader: the tessellation evaluation shader (asserted)
 * map:    I/O semantic -> driver location mapping
 */
void
ac_nir_lower_tes_inputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map)
{
   assert(shader->info.stage == MESA_SHADER_TESS_EVAL);

   /* Only the location mapping is needed by the TES callbacks. */
   lower_tess_io_state tes_state = { .map_io = map };

   nir_shader_lower_instructions(shader,
                                 filter_any_input_access,
                                 lower_tes_input_load,
                                 &tes_state);
}