blob: 3493c7b53dbffbad5269eecd3e7c48016b553569 [file] [log] [blame]
/*
* Copyright 2024 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
/**
* This pass:
* - vectorizes lowered input/output loads and stores
* - vectorizes low and high 16-bit loads and stores by merging them into
* a single 32-bit load or store (except load_interpolated_input and XFB,
* which have to keep bit_size=16)
* - performs DCE of output stores that overwrite the previous value by writing
* into the same slot and component.
*
* Vectorization is only local within basic blocks. No vectorization occurs
* across basic block boundaries, barriers (only TCS outputs), emits (only
* GS outputs), and output load <-> output store dependencies.
*
* All loads and stores must be scalar. 64-bit loads and stores are forbidden.
*
* For each basic block, the time complexity is O(n*log(n)) where n is
* the number of IO instructions within that block.
*/
#include "util/u_dynarray.h"
#include "nir.h"
#include "nir_builder.h"
/* Return 0 when two IO loads/stores may be vectorized together. Otherwise
 * return 1 or -1 to impose a total ordering between non-vectorizable
 * instructions; qsort uses this to cluster vectorizable instructions into
 * adjacent groups.
 */
static int
compare_is_not_vectorizable(nir_intrinsic_instr *a, nir_intrinsic_instr *b)
{
   if (a->intrinsic != b->intrinsic)
      return a->intrinsic > b->intrinsic ? 1 : -1;

   /* The (indirect) offset sources must be identical SSA values. */
   nir_src *off_a = nir_get_io_offset_src(a);
   nir_src *off_b = nir_get_io_offset_src(b);
   if (off_a && off_a->ssa != off_b->ssa)
      return off_a->ssa->index > off_b->ssa->index ? 1 : -1;

   /* The arrayed (per-vertex/per-primitive) index sources must match too. */
   nir_src *arr_a = nir_get_io_arrayed_index_src(a);
   nir_src *arr_b = nir_get_io_arrayed_index_src(b);
   if (arr_a && arr_a->ssa != arr_b->ssa)
      return arr_a->ssa->index > arr_b->ssa->index ? 1 : -1;

   /* Compare barycentrics or vertex index. */
   bool compares_src0 = a->intrinsic == nir_intrinsic_load_interpolated_input ||
                        a->intrinsic == nir_intrinsic_load_input_vertex;
   if (compares_src0 && a->src[0].ssa != b->src[0].ssa)
      return a->src[0].ssa->index > b->src[0].ssa->index ? 1 : -1;

   nir_io_semantics sem_a = nir_intrinsic_io_semantics(a);
   nir_io_semantics sem_b = nir_intrinsic_io_semantics(b);

   if (sem_a.location != sem_b.location)
      return sem_a.location > sem_b.location ? 1 : -1;

   /* The mediump flag isn't mergable. */
   if (sem_a.medium_precision != sem_b.medium_precision)
      return sem_a.medium_precision > sem_b.medium_precision ? 1 : -1;

   /* Don't merge per-view attributes with non-per-view attributes. */
   if (sem_a.per_view != sem_b.per_view)
      return sem_a.per_view > sem_b.per_view ? 1 : -1;

   if (sem_a.interp_explicit_strict != sem_b.interp_explicit_strict)
      return sem_a.interp_explicit_strict > sem_b.interp_explicit_strict ? 1 : -1;

   /* Only load_interpolated_input can't merge low and high halves of 16-bit
    * loads/stores.
    */
   if (a->intrinsic == nir_intrinsic_load_interpolated_input &&
       sem_a.high_16bits != sem_b.high_16bits)
      return sem_a.high_16bits > sem_b.high_16bits ? 1 : -1;

   /* TODO: vectorize (f32, f32, f16vec2, f16vec2) -> vec4
    * For now, different bit sizes are not vectorized together.
    */
   unsigned bits_a, bits_b;
   if (nir_intrinsic_has_src_type(a)) {
      /* Stores: compare the stored value's bit size. */
      bits_a = a->src[0].ssa->bit_size;
      bits_b = b->src[0].ssa->bit_size;
   } else {
      /* Loads: compare the destination's bit size. */
      bits_a = a->def.bit_size;
      bits_b = b->def.bit_size;
   }
   if (bits_a != bits_b)
      return bits_a > bits_b ? 1 : -1;

   nir_shader *shader =
      nir_cf_node_get_function(&a->instr.block->cf_node)->function->shader;

   /* Compare the types unless the driver says it ignores them. */
   if (!(shader->options->io_options & nir_io_vectorizer_ignores_types)) {
      unsigned type_a, type_b;

      if (nir_intrinsic_has_src_type(a)) {
         type_a = nir_intrinsic_src_type(a);
         type_b = nir_intrinsic_src_type(b);
      } else {
         type_a = nir_intrinsic_dest_type(a);
         type_b = nir_intrinsic_dest_type(b);
      }

      if (type_a != type_b)
         return type_a > type_b ? 1 : -1;
   }

   return 0;
}
/* qsort comparator: groups vectorizable instructions together and falls back
 * to original program order within a group.
 */
static int
compare_intr(const void *pa, const void *pb)
{
   nir_intrinsic_instr *a = *(nir_intrinsic_instr **)pa;
   nir_intrinsic_instr *b = *(nir_intrinsic_instr **)pb;

   int order = compare_is_not_vectorizable(a, b);
   if (order != 0)
      return order;

   /* qsort isn't stable. Comparing instr.index keeps the original order,
    * so later stores are never moved before earlier stores.
    */
   return a->instr.index > b->instr.index ? 1 : -1;
}
/* The order of vectorization steps applied to one slot's channels. */
typedef enum {
   /* Merge matching low/high 16-bit channel pairs into single 32-bit ops. */
   merge_low_high_16_to_32,
   /* Vectorize the remaining high 16-bit channels among themselves. */
   vectorize_high_16_separately,
   /* Vectorize everything that's left (low/full channels). */
   vectorize_the_rest,
} nir_vectorize_op_step;
/* Whether the RADV intrinsic-component workaround is enabled for this
 * shader (keeps the raw 0..7 channel index in the component field).
 */
static bool
apply_radv_workaround(nir_builder *b)
{
   const struct nir_shader_compiler_options *opts = b->shader->options;

   return (opts->io_options & nir_io_radv_intrinsic_component_workaround) != 0;
}
/* Replace the scalar loads chan[start..start+count-1] (plus the matching
 * high 16-bit loads chan[start+4..] in the merge step) with one vectorized
 * load inserted at the earliest of them, then rewrite all uses.
 */
static void
vectorize_load(nir_intrinsic_instr *chan[8], unsigned start, unsigned count,
               nir_vectorize_op_step step)
{
   nir_intrinsic_instr *first = NULL;

   /* Find the first instruction where the vectorized load will be
    * inserted.
    */
   for (unsigned i = start; i < start + count; i++) {
      /* Loads may have holes (allow_holes); skip missing channels. */
      if (!chan[i])
         continue;

      first = !first || chan[i]->instr.index < first->instr.index ? chan[i] : first;

      if (step == merge_low_high_16_to_32) {
         /* The matching high half may appear earlier in the block. */
         first = !first || chan[4 + i]->instr.index < first->instr.index ? chan[4 + i] : first;
      }
   }

   /* Insert the vectorized load. */
   nir_builder b = nir_builder_at(nir_before_instr(&first->instr));
   nir_intrinsic_instr *new_intr =
      nir_intrinsic_instr_create(b.shader, first->intrinsic);

   new_intr->num_components = count;
   /* Merged 16-bit halves become a 32-bit load; otherwise keep the size. */
   nir_def_init(&new_intr->instr, &new_intr->def, count,
                step == merge_low_high_16_to_32 ? 32 : first->def.bit_size);
   /* Reuse all sources of "first" (offset, vertex index, barycentrics...);
    * the builder insert below registers the uses.
    */
   memcpy(new_intr->src, first->src,
          nir_intrinsic_infos[first->intrinsic].num_srcs * sizeof(nir_src));
   nir_intrinsic_copy_const_indices(new_intr, first);

   if (apply_radv_workaround(&b))
      nir_intrinsic_set_component(new_intr, start);
   else
      nir_intrinsic_set_component(new_intr, start & 0x3); /* Bits 4..7 should map to 0..3 */
   assert(start % 4 + count <= 4);

   nir_io_semantics sem = nir_intrinsic_io_semantics(new_intr);

   if (step == vectorize_high_16_separately) {
      assert(start >= 4);
      sem.high_16bits = 1;
   } else {
      assert(start <= 3);
   }

   if (step == merge_low_high_16_to_32) {
      sem.high_16bits = 0;
      /* Switch the dest type's bit-size encoding from 16 to 32
       * (nir_alu_type keeps the size in the low bits).
       */
      nir_intrinsic_set_dest_type(new_intr,
                                  (nir_intrinsic_dest_type(new_intr) & ~16) | 32);
   }

   nir_intrinsic_set_io_semantics(new_intr, sem);
   nir_builder_instr_insert(&b, &new_intr->instr);
   nir_def *def = &new_intr->def;

   /* Replace the scalar loads. */
   if (step == merge_low_high_16_to_32) {
      /* Both halves are guaranteed present here (see vectorize_slot). */
      for (unsigned i = start; i < start + count; i++) {
         nir_def *comp = nir_channel(&b, def, i - start);

         /* Low half = low 16 bits, high half = high 16 bits of each dword. */
         nir_def_rewrite_uses(&chan[i]->def,
                              nir_unpack_32_2x16_split_x(&b, comp));
         nir_def_rewrite_uses(&chan[4 + i]->def,
                              nir_unpack_32_2x16_split_y(&b, comp));
         nir_instr_remove(&chan[i]->instr);
         nir_instr_remove(&chan[4 + i]->instr);
      }
   } else {
      for (unsigned i = start; i < start + count; i++) {
         if (chan[i])
            nir_def_replace(&chan[i]->def, nir_channel(&b, def, i - start));
      }
   }
}
/* Replace the scalar stores chan[start..start+count-1] (plus the matching
 * high 16-bit stores chan[start+4..] in the merge step) with one vectorized
 * store. The last store in program order is rewritten in place, which
 * guarantees every stored value is defined at the insertion point.
 */
static void
vectorize_store(nir_intrinsic_instr *chan[8], unsigned start, unsigned count,
                nir_vectorize_op_step step)
{
   nir_intrinsic_instr *last = NULL;

   /* Find the last instruction where the vectorized store will be
    * inserted. (Stores never have holes, so all scanned channels exist.)
    */
   for (unsigned i = start; i < start + count; i++) {
      last = !last || chan[i]->instr.index > last->instr.index ? chan[i] : last;

      if (step == merge_low_high_16_to_32) {
         last = !last || chan[4 + i]->instr.index > last->instr.index ? chan[4 + i] : last;
      }
   }

   /* Change the last instruction to a vectorized store. Update xfb first
    * because we need to read some info from "last" before overwriting it.
    */
   if (nir_intrinsic_has_io_xfb(last)) {
      /* 0 = low/full XY channels
       * 1 = low/full ZW channels
       * 2 = high XY channels
       * 3 = high ZW channels
       */
      nir_io_xfb xfb[4] = { { { { 0 } } } };

      /* Gather the per-channel xfb info from the scalar stores. */
      for (unsigned i = start; i < start + count; i++) {
         xfb[i / 2].out[i % 2] =
            ((i % 4) < 2 ? nir_intrinsic_io_xfb(chan[i]) : nir_intrinsic_io_xfb2(chan[i])).out[i % 2];

         /* Merging low and high 16 bits to 32 bits is not possible
          * with xfb in some cases. vectorize_slot filters those out.
          */
         assert(!xfb[i / 2].out[i % 2].num_components ||
                step != merge_low_high_16_to_32);
      }

      /* Now vectorize xfb info by merging the individual elements. */
      for (unsigned i = start; i < start + count; i++) {
         /* mediump means that xfb upconverts to 32 bits when writing to
          * memory.
          */
         unsigned xfb_comp_size =
            nir_intrinsic_io_semantics(chan[i]).medium_precision ? 32 : chan[i]->src[0].ssa->bit_size;

         for (unsigned j = i + 1; j < start + count; j++) {
            /* Component j can only be folded into component i's xfb output
             * when it targets the same buffer at i's offset advanced by
             * (j - i) components. Note that later components must be at
             * *larger* offsets, i.e. offset_j == offset_i + size * (j - i).
             */
            if (xfb[i / 2].out[i % 2].buffer != xfb[j / 2].out[j % 2].buffer ||
                xfb[i / 2].out[i % 2].offset + xfb_comp_size * (j - i) !=
                   xfb[j / 2].out[j % 2].offset)
               break;

            xfb[i / 2].out[i % 2].num_components++;
            memset(&xfb[j / 2].out[j % 2], 0, sizeof(xfb[j / 2].out[j % 2]));
         }
      }

      if (start >= 4) {
         /* High 16-bit channels use the xfb slots for components 4..7. */
         nir_intrinsic_set_io_xfb(last, xfb[2]);
         nir_intrinsic_set_io_xfb2(last, xfb[3]);
      } else {
         assert(start + count <= 4);
         nir_intrinsic_set_io_xfb(last, xfb[0]);
         nir_intrinsic_set_io_xfb2(last, xfb[1]);
      }
   }

   /* Update gs_streams: 2 bits per vectorized component. */
   unsigned gs_streams = 0;
   for (unsigned i = start; i < start + count; i++) {
      gs_streams |= (nir_intrinsic_io_semantics(chan[i]).gs_streams & 0x3) << ((i - start) * 2);
   }

   nir_io_semantics sem = nir_intrinsic_io_semantics(last);
   sem.gs_streams = gs_streams;

   if (step == vectorize_high_16_separately) {
      assert(start >= 4);
      sem.high_16bits = 1;
   } else {
      assert(start <= 3);
   }

   /* Update other flags: the "no" flags may only stay set if every merged
    * channel had them set.
    */
   for (unsigned i = start; i < start + count; i++) {
      if (!nir_intrinsic_io_semantics(chan[i]).no_sysval_output)
         sem.no_sysval_output = 0;
      if (!nir_intrinsic_io_semantics(chan[i]).no_varying)
         sem.no_varying = 0;
   }

   if (step == merge_low_high_16_to_32) {
      /* Update "no" flags for high bits. */
      for (unsigned i = start; i < start + count; i++) {
         if (!nir_intrinsic_io_semantics(chan[4 + i]).no_sysval_output)
            sem.no_sysval_output = 0;
         if (!nir_intrinsic_io_semantics(chan[4 + i]).no_varying)
            sem.no_varying = 0;
      }

      /* Update the type: switch the bit-size encoding from 16 to 32. */
      sem.high_16bits = 0;
      nir_intrinsic_set_src_type(last,
                                 (nir_intrinsic_src_type(last) & ~16) | 32);
   }

   /* TODO: Merge names? */

   nir_builder b = nir_builder_at(nir_before_instr(&last->instr));

   /* Update the rest. */
   nir_intrinsic_set_io_semantics(last, sem);

   if (apply_radv_workaround(&b))
      nir_intrinsic_set_component(last, start);
   else
      nir_intrinsic_set_component(last, start & 0x3); /* Bits 4..7 should map to 0..3 */
   assert(start % 4 + count <= 4);

   nir_intrinsic_set_write_mask(last, BITFIELD_MASK(count));
   last->num_components = count;

   /* Replace the stored scalar with the vector. */
   if (step == merge_low_high_16_to_32) {
      /* Pack each low/high 16-bit pair into one 32-bit value. */
      nir_def *value[4];

      for (unsigned i = start; i < start + count; i++) {
         value[i] = nir_pack_32_2x16_split(&b, chan[i]->src[0].ssa,
                                           chan[4 + i]->src[0].ssa);
      }

      nir_src_rewrite(&last->src[0], nir_vec(&b, &value[start], count));
   } else {
      nir_def *value[8];

      for (unsigned i = start; i < start + count; i++)
         value[i] = chan[i]->src[0].ssa;

      nir_src_rewrite(&last->src[0], nir_vec(&b, &value[start], count));
   }

   /* Remove the scalar stores. */
   for (unsigned i = start; i < start + count; i++) {
      if (chan[i] != last)
         nir_instr_remove(&chan[i]->instr);
      if (step == merge_low_high_16_to_32 && chan[4 + i] != last)
         nir_instr_remove(&chan[4 + i]->instr);
   }
}
/* Vectorize a vector of scalar instructions. chan[8] are the channels
 * (the last 4 are the high 16-bit channels), and "mask" has one bit set
 * per present channel. Returns true if anything was vectorized.
 */
static bool
vectorize_slot(nir_intrinsic_instr *chan[8], unsigned mask, bool allow_holes)
{
   bool progress = false;

   assert(mask);
   /* All instructions in a group are either loads or stores, never mixed. */
   bool is_load = nir_intrinsic_infos[chan[ffs(mask) - 1]->intrinsic].has_dest;

   /* First, merge low and high 16-bit halves into 32 bits separately when
    * possible. Then vectorize what's left.
    */
   for (nir_vectorize_op_step step = merge_low_high_16_to_32;
        step <= vectorize_the_rest; step++) {
      unsigned scan_mask;

      if (step == merge_low_high_16_to_32) {
         /* Get the subset of the mask where both low and high bits are set. */
         scan_mask = 0;

         for (unsigned i = 0; i < 4; i++) {
            unsigned low_high_bits = BITFIELD_BIT(i) | BITFIELD_BIT(i + 4);

            if ((mask & low_high_bits) == low_high_bits) {
               /* Merging low and high 16 bits to 32 bits is not possible
                * with xfb in some cases.
                */
               if (nir_intrinsic_has_io_xfb(chan[i])) {
                  unsigned hi = i + 4;

                  if ((i < 2 ? nir_intrinsic_io_xfb(chan[i])
                             : nir_intrinsic_io_xfb2(chan[i]))
                         .out[i % 2]
                         .num_components ||
                      (i < 2 ? nir_intrinsic_io_xfb(chan[hi])
                             : nir_intrinsic_io_xfb2(chan[hi]))
                         .out[i % 2]
                         .num_components)
                     continue;
               }

               /* The GS stream must be the same for both halves. */
               if ((nir_intrinsic_io_semantics(chan[i]).gs_streams & 0x3) !=
                   (nir_intrinsic_io_semantics(chan[4 + i]).gs_streams & 0x3))
                  continue;

               scan_mask |= BITFIELD_BIT(i);
               mask &= ~low_high_bits;
            }
         }
      } else if (step == vectorize_high_16_separately) {
         scan_mask = mask & BITFIELD_RANGE(4, 4);
         mask &= ~scan_mask;

         /* Fill load holes by widening the load. Only do this when there
          * are high 16-bit channels at all: with scan_mask == 0,
          * util_last_bit returns 0 and "num - 4" would underflow, making
          * BITFIELD_RANGE shift by an out-of-range amount (UB) and
          * producing a garbage mask with NULL channels.
          */
         if (is_load && allow_holes && scan_mask) {
            unsigned num = util_last_bit(scan_mask);
            scan_mask = BITFIELD_RANGE(4, num - 4);
         }
      } else {
         scan_mask = mask;

         /* Same hole-filling as above for the low/full channels. */
         if (is_load && allow_holes && scan_mask) {
            unsigned num = util_last_bit(scan_mask);
            scan_mask = BITFIELD_MASK(num);
         }
      }

      while (scan_mask) {
         int start, count;

         u_bit_scan_consecutive_range(&scan_mask, &start, &count);

         /* A single channel can't be vectorized, but a low/high 16-bit
          * pair is still merged into one 32-bit op.
          */
         if (count == 1 && step != merge_low_high_16_to_32)
            continue; /* There is nothing to vectorize. */

         if (is_load)
            vectorize_load(chan, start, count, step);
         else
            vectorize_store(chan, start, count, step);

         progress = true;
      }
   }

   return progress;
}
static bool
vectorize_batch(struct util_dynarray *io_instructions, bool allow_holes)
{
unsigned num_instr = util_dynarray_num_elements(io_instructions, void *);
/* We need to at least 2 instructions to have something to do. */
if (num_instr <= 1) {
/* Clear the array. The next block will reuse it. */
util_dynarray_clear(io_instructions);
return false;
}
/* The instructions are sorted such that groups of vectorizable
* instructions are next to each other. Multiple incompatible
* groups of vectorizable instructions can occur in this array.
* The reason why 2 groups would be incompatible is that they
* could have a different intrinsic, indirect index, array index,
* vertex index, barycentrics, or location. Each group is vectorized
* separately.
*
* This reorders instructions in the array, but not in the shader.
*/
qsort(io_instructions->data, num_instr, sizeof(void *), compare_intr);
nir_intrinsic_instr *chan[8] = { 0 }, *prev = NULL;
unsigned chan_mask = 0;
bool progress = false;
/* Vectorize all groups.
*
* The channels for each group are gathered. If 2 stores overwrite
* the same channel, the earlier store is DCE'd here.
*/
util_dynarray_foreach(io_instructions, nir_intrinsic_instr *, intr) {
/* If the next instruction is not vectorizable, vectorize what
* we have gathered so far.
*/
if (prev && compare_is_not_vectorizable(prev, *intr)) {
/* We need at least 2 instructions to have something to do. */
if (util_bitcount(chan_mask) > 1)
progress |= vectorize_slot(chan, chan_mask, allow_holes);
prev = NULL;
memset(chan, 0, sizeof(chan));
chan_mask = 0;
}
/* This performs DCE of output stores because the previous value
* is being overwritten.
*/
unsigned index = nir_intrinsic_io_semantics(*intr).high_16bits * 4 +
nir_intrinsic_component(*intr);
bool is_store = !nir_intrinsic_infos[(*intr)->intrinsic].has_dest;
if (is_store && chan[index])
nir_instr_remove(&chan[index]->instr);
/* Gather the channel. */
chan[index] = *intr;
prev = *intr;
chan_mask |= BITFIELD_BIT(index);
}
/* Vectorize the last group. */
if (prev && util_bitcount(chan_mask) > 1)
progress |= vectorize_slot(chan, chan_mask, allow_holes);
/* Clear the array. The next block will reuse it. */
util_dynarray_clear(io_instructions);
return progress;
}
/* Vectorize lowered IO (load_input/store_output/...).
 *
 * modes specifies whether to vectorize inputs and/or outputs.
 *
 * allow_holes enables vectorization of loads with holes, e.g.:
 *    load X; load W; ==> load XYZW;
 *
 * This is useful for VS input loads where it might not be possible to skip
 * loading unused components, e.g. with AMD where loading W also loads XYZ,
 * so if we also load X separately again, it's wasteful. It's better to get
 * X from the vector that loads (XYZ)W.
 *
 * Returns true if any instruction was vectorized or removed.
 */
bool
nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes,
                     bool allow_holes)
{
   /* This pass only understands shader inputs and outputs. */
   assert(!(modes & ~(nir_var_shader_in | nir_var_shader_out)));

   /* Honor drivers that want FS inputs to stay scalar. */
   if (shader->info.stage == MESA_SHADER_FRAGMENT &&
       shader->options->io_options & nir_io_prefer_scalar_fs_inputs)
      modes &= ~nir_var_shader_in;

   if ((shader->info.stage == MESA_SHADER_TESS_CTRL ||
        shader->info.stage == MESA_SHADER_GEOMETRY) &&
       util_bitcount(modes) == 2) {
      /* When vectorizing TCS and GS IO, inputs can ignore barriers and emits,
       * but that is only done when outputs are ignored, so vectorize them
       * separately.
       */
      bool progress_in = nir_opt_vectorize_io(shader, nir_var_shader_in,
                                              allow_holes);
      bool progress_out = nir_opt_vectorize_io(shader, nir_var_shader_out,
                                               allow_holes);
      return progress_in || progress_out;
   }

   /* Initialize dynamic arrays. */
   struct util_dynarray io_instructions;
   util_dynarray_init(&io_instructions, NULL);

   bool global_progress = false;

   nir_foreach_function_impl(impl, shader) {
      bool progress = false;

      /* instr.index is needed to sort by program order and to choose
       * insertion points for vectorized loads/stores.
       */
      nir_metadata_require(impl, nir_metadata_instr_index);

      nir_foreach_block(block, impl) {
         /* One bit per output channel (slot * 8 + high_16bits * 4 +
          * component), used to detect output load <-> store dependencies
          * within the current batch.
          */
         BITSET_DECLARE(has_output_loads, NUM_TOTAL_VARYING_SLOTS * 8);
         BITSET_DECLARE(has_output_stores, NUM_TOTAL_VARYING_SLOTS * 8);
         BITSET_ZERO(has_output_loads);
         BITSET_ZERO(has_output_stores);

         /* Gather load/store intrinsics within the block. */
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
            bool is_load = nir_intrinsic_infos[intr->intrinsic].has_dest;
            bool is_output = false;
            nir_io_semantics sem = { 0 };
            unsigned index = 0;

            if (nir_intrinsic_has_io_semantics(intr)) {
               sem = nir_intrinsic_io_semantics(intr);
               assert(sem.location < NUM_TOTAL_VARYING_SLOTS);
               /* 8 channels per slot: 4 low/full + 4 high 16-bit. */
               index = sem.location * 8 + sem.high_16bits * 4 +
                       nir_intrinsic_component(intr);
            }

            switch (intr->intrinsic) {
            case nir_intrinsic_load_input:
            case nir_intrinsic_load_per_primitive_input:
            case nir_intrinsic_load_input_vertex:
            case nir_intrinsic_load_interpolated_input:
            case nir_intrinsic_load_per_vertex_input:
               if (!(modes & nir_var_shader_in))
                  continue;
               break;

            case nir_intrinsic_load_output:
            case nir_intrinsic_load_per_vertex_output:
            case nir_intrinsic_load_per_view_output:
            case nir_intrinsic_load_per_primitive_output:
            case nir_intrinsic_store_output:
            case nir_intrinsic_store_per_vertex_output:
            case nir_intrinsic_store_per_view_output:
            case nir_intrinsic_store_per_primitive_output:
               if (!(modes & nir_var_shader_out))
                  continue;

               /* Break the batch if an output load is followed by an output
                * store to the same channel and vice versa.
                */
               if (BITSET_TEST(is_load ? has_output_stores : has_output_loads,
                               index)) {
                  progress |= vectorize_batch(&io_instructions, allow_holes);
                  BITSET_ZERO(has_output_loads);
                  BITSET_ZERO(has_output_stores);
               }
               is_output = true;
               break;

            case nir_intrinsic_barrier:
               /* Don't vectorize across TCS barriers. */
               if (modes & nir_var_shader_out &&
                   nir_intrinsic_memory_modes(intr) & nir_var_shader_out) {
                  progress |= vectorize_batch(&io_instructions, allow_holes);
                  BITSET_ZERO(has_output_loads);
                  BITSET_ZERO(has_output_stores);
               }
               continue;

            case nir_intrinsic_emit_vertex:
               /* Don't vectorize across GS emits. */
               progress |= vectorize_batch(&io_instructions, allow_holes);
               BITSET_ZERO(has_output_loads);
               BITSET_ZERO(has_output_stores);
               continue;

            default:
               continue;
            }

            /* Only scalar 16 and 32-bit instructions are allowed. */
            ASSERTED nir_def *value = is_load ? &intr->def : intr->src[0].ssa;
            assert(value->num_components == 1);
            assert(value->bit_size == 16 || value->bit_size == 32);

            util_dynarray_append(&io_instructions, void *, intr);

            if (is_output)
               BITSET_SET(is_load ? has_output_loads : has_output_stores, index);
         }

         /* Vectorize whatever is left at the end of the block. */
         progress |= vectorize_batch(&io_instructions, allow_holes);
      }

      nir_progress(progress, impl,
                   nir_metadata_block_index | nir_metadata_dominance);
      global_progress |= progress;
   }

   util_dynarray_fini(&io_instructions);
   return global_progress;
}