/*
* Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
* Copyright (C) 2020 Collabora Ltd.
* Copyright © 2016 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "main/glheader.h"
#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_debug.h"
#include "util/fast_idiv_by_const.h"
#include "agx_compile.h"
#include "agx_compiler.h"
#include "agx_builder.h"
/* Alignment for shader programs. I'm not sure what the optimal value is. */
#define AGX_CODE_ALIGN 0x100
static const struct debug_named_value agx_debug_options[] = {
{"msgs", AGX_DBG_MSGS, "Print debug messages"},
{"shaders", AGX_DBG_SHADERS, "Dump shaders in NIR and AIR"},
{"shaderdb", AGX_DBG_SHADERDB, "Print statistics"},
{"verbose", AGX_DBG_VERBOSE, "Disassemble verbosely"},
{"internal", AGX_DBG_INTERNAL, "Dump even internal shaders"},
{"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"},
{"noopt", AGX_DBG_NOOPT, "Disable backend optimizations"},
DEBUG_NAMED_VALUE_END
};
DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0)
int agx_debug = 0;
#define DBG(fmt, ...) \
do { if (agx_debug & AGX_DBG_MSGS) \
fprintf(stderr, "%s:%d: "fmt, \
__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
static agx_index
agx_cached_preload(agx_context *ctx, agx_index *cache, unsigned base, enum agx_size size)
{
if (agx_is_null(*cache)) {
agx_block *block = agx_start_block(ctx);
agx_builder b = agx_init_builder(ctx, agx_before_block(block));
*cache = agx_preload(&b, agx_register(base, size));
}
return *cache;
}
static agx_index
agx_vertex_id(agx_builder *b)
{
return agx_cached_preload(b->shader, &b->shader->vertex_id, 10, AGX_SIZE_32);
}
static agx_index
agx_instance_id(agx_builder *b)
{
return agx_cached_preload(b->shader, &b->shader->instance_id, 12, AGX_SIZE_32);
}
static agx_index
agx_get_cf(agx_context *ctx, bool smooth, bool perspective,
gl_varying_slot slot, unsigned offset, unsigned count)
{
struct agx_varyings_fs *varyings = &ctx->out->varyings.fs;
unsigned cf_base = varyings->nr_cf;
if (slot == VARYING_SLOT_POS) {
assert(offset == 2 || offset == 3);
varyings->reads_z |= (offset == 2);
}
/* First, search for an appropriate binding. This is O(n) in the number of
* bindings, which isn't great, but n should be small in practice.
*/
for (unsigned b = 0; b < varyings->nr_bindings; ++b) {
if ((varyings->bindings[b].slot == slot) &&
(varyings->bindings[b].offset == offset) &&
(varyings->bindings[b].count == count) &&
(varyings->bindings[b].smooth == smooth) &&
(varyings->bindings[b].perspective == perspective)) {
return agx_immediate(varyings->bindings[b].cf_base);
}
}
/* If we didn't find one, make one */
unsigned b = varyings->nr_bindings++;
varyings->bindings[b].cf_base = varyings->nr_cf;
varyings->bindings[b].slot = slot;
varyings->bindings[b].offset = offset;
varyings->bindings[b].count = count;
varyings->bindings[b].smooth = smooth;
varyings->bindings[b].perspective = perspective;
varyings->nr_cf += count;
return agx_immediate(cf_base);
}
/* Builds a 64-bit hash table key for an index */
static uint64_t
agx_index_to_key(agx_index idx)
{
STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t));
uint64_t key = 0;
memcpy(&key, &idx, sizeof(idx));
return key;
}
/*
* Extract a single channel out of a vector source. We split vectors with
* p_split so we can use the split components directly, without emitting a
* machine instruction. This has advantages for RA, as the split can usually be
* optimized away.
*/
static agx_index
agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel)
{
agx_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
agx_index_to_key(vec));
assert(components != NULL && "missing agx_emit_collect_to");
return components[channel];
}
static void
agx_cache_collect(agx_builder *b, agx_index dst, unsigned nr_srcs,
agx_index *srcs)
{
/* Lifetime of a hash table entry has to be at least as long as the table */
agx_index *channels = ralloc_array(b->shader, agx_index, nr_srcs);
for (unsigned i = 0; i < nr_srcs; ++i)
channels[i] = srcs[i];
_mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst),
channels);
}
/*
* Combine multiple scalars into a vector destination. This corresponds to
* collect, lowered to moves (a shuffle in general) after register allocation.
*
* To optimize vector extractions, we record the individual channels.
*/
static agx_instr *
agx_emit_collect_to(agx_builder *b, agx_index dst, unsigned nr_srcs,
agx_index *srcs)
{
agx_cache_collect(b, dst, nr_srcs, srcs);
if (nr_srcs == 1)
return agx_mov_to(b, dst, srcs[0]);
agx_instr *I = agx_collect_to(b, dst, nr_srcs);
agx_foreach_src(I, s)
I->src[s] = srcs[s];
return I;
}
static agx_index
agx_vec4(agx_builder *b, agx_index s0, agx_index s1, agx_index s2, agx_index s3)
{
agx_index dst = agx_temp(b->shader, s0.size);
agx_index idx[4] = { s0, s1, s2, s3 };
agx_emit_collect_to(b, dst, 4, idx);
return dst;
}
static agx_index
agx_vec2(agx_builder *b, agx_index s0, agx_index s1)
{
agx_index dst = agx_temp(b->shader, s0.size);
agx_index idx[2] = { s0, s1 };
agx_emit_collect_to(b, dst, 2, idx);
return dst;
}
static void
agx_block_add_successor(agx_block *block, agx_block *successor)
{
assert(block != NULL && successor != NULL);
/* Cull impossible edges */
if (block->unconditional_jumps)
return;
for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
if (block->successors[i]) {
if (block->successors[i] == successor)
return;
else
continue;
}
block->successors[i] = successor;
util_dynarray_append(&successor->predecessors, agx_block *, block);
return;
}
unreachable("Too many successors");
}
/*
* Splits an n-component vector (vec) into n scalar destinations (dests) using a
* split pseudo-instruction.
*
* Pre-condition: dests is filled with agx_null().
*/
static void
agx_emit_split(agx_builder *b, agx_index *dests, agx_index vec, unsigned n)
{
agx_instr *I = agx_split(b, n, vec);
agx_foreach_dest(I, d) {
dests[d] = agx_temp(b->shader, vec.size);
I->dest[d] = dests[d];
}
}
static void
agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n)
{
agx_index dests[4] = { agx_null(), agx_null(), agx_null(), agx_null() };
agx_emit_split(b, dests, vec, n);
agx_cache_collect(b, vec, n, dests);
}
static void
agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
{
/* Ensure we've been scalarized and bit size lowered */
unsigned bit_size = instr->def.bit_size;
assert(instr->def.num_components == 1);
/* Emit move, later passes can inline/push if useful */
agx_mov_imm_to(b,
agx_get_index(instr->def.index, agx_size_for_bits(bit_size)),
nir_const_value_as_uint(instr->value[0], bit_size));
}
/*
* Implement umul_high of 32-bit sources by doing a 32x32->64-bit multiply and
* extracting only the high word.
*/
static agx_instr *
agx_umul_high_to(agx_builder *b, agx_index dst, agx_index P, agx_index Q)
{
assert(P.size == Q.size && "source sizes must match");
assert(P.size == dst.size && "dest size must match");
assert(P.size != AGX_SIZE_64 && "64x64 multiply should have been lowered");
static_assert(AGX_SIZE_64 == (AGX_SIZE_32 + 1), "enum wrong");
static_assert(AGX_SIZE_32 == (AGX_SIZE_16 + 1), "enum wrong");
agx_index product = agx_temp(b->shader, P.size + 1);
agx_imad_to(b, product, agx_abs(P), agx_abs(Q), agx_zero(), 0);
agx_instr *split = agx_split(b, 2, product);
split->dest[1] = dst;
return split;
}
static agx_index
agx_umul_high(agx_builder *b, agx_index P, agx_index Q)
{
agx_index dst = agx_temp(b->shader, P.size);
agx_umul_high_to(b, dst, P, Q);
return dst;
}
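/*
* Worked example (illustrative): umul_high(0x80000000, 0x80000000) forms the
* full 64-bit product 0x4000000000000000 in a temporary one size class wider,
* splits it, and keeps only the high word, 0x40000000.
*/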
/* Emit code dividing P by Q */
static agx_index
agx_udiv_const(agx_builder *b, agx_index P, uint32_t Q)
{
/* P / 1 = P */
if (Q == 1) {
return P;
}
/* P / UINT32_MAX = 0, unless P == UINT32_MAX, in which case the result is 1 */
if (Q == UINT32_MAX) {
agx_index max = agx_mov_imm(b, 32, UINT32_MAX);
agx_index one = agx_mov_imm(b, 32, 1);
return agx_icmpsel(b, P, max, one, agx_zero(), AGX_ICOND_UEQ);
}
/* P / 2^N = P >> N */
if (util_is_power_of_two_or_zero(Q)) {
return agx_ushr(b, P, agx_mov_imm(b, 32, util_logbase2(Q)));
}
/* Fall back on multiplication by a magic number */
struct util_fast_udiv_info info = util_compute_fast_udiv_info(Q, 32, 32);
agx_index preshift = agx_mov_imm(b, 32, info.pre_shift);
agx_index increment = agx_mov_imm(b, 32, info.increment);
agx_index postshift = agx_mov_imm(b, 32, info.post_shift);
agx_index multiplier = agx_mov_imm(b, 32, info.multiplier);
agx_index n = P;
if (info.pre_shift != 0) n = agx_ushr(b, n, preshift);
if (info.increment != 0) n = agx_iadd(b, n, increment, 0);
n = agx_umul_high(b, n, multiplier);
if (info.post_shift != 0) n = agx_ushr(b, n, postshift);
return n;
}
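/*
* Illustrative expansion of the magic-number path above, assuming a divisor
* whose magic needs neither pre-shift nor increment (the actual constants come
* from util_compute_fast_udiv_info). For Q = 5:
*
*    n = umul_high(n, 0xCCCCCCCD);   // multiplier = ceil(2^34 / 5)
*    n = n >> 2;                     // post_shift
*
* which satisfies P / 5 == ((uint64_t)P * 0xCCCCCCCD) >> 34 for all 32-bit P.
*/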
/* AGX appears to lack support for vertex attributes. Lower to global loads. */
static void
agx_emit_load_attr(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
nir_src *offset_src = nir_get_io_offset_src(instr);
assert(nir_src_is_const(*offset_src) && "no attribute indirects");
unsigned index = nir_intrinsic_base(instr) +
nir_src_as_uint(*offset_src);
struct agx_shader_key *key = b->shader->key;
struct agx_attribute attrib = key->vs.attributes[index];
/* address = base + (stride * vertex_id) + src_offset */
unsigned buf = attrib.buf;
unsigned stride = key->vs.vbuf_strides[buf];
unsigned shift = agx_format_shift(attrib.format);
agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift);
agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
/* A nonzero divisor requires dividing the instance ID. A zero divisor
* specifies per-vertex data. */
agx_index element_id = (attrib.divisor == 0) ? agx_vertex_id(b) :
agx_udiv_const(b, agx_instance_id(b), attrib.divisor);
agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0);
/* Each VBO has a 64-bit = 4 x 16-bit address; look up the base address as a
* sysval. Mov around the base to handle uniform restrictions; copyprop will
* usually clean that up.
*/
agx_index base = agx_mov(b, agx_vbo_base(b->shader, buf));
/* Load the data */
assert(instr->num_components <= 4);
unsigned actual_comps = (attrib.nr_comps_minus_1 + 1);
agx_index vec = agx_vec_for_dest(b->shader, &instr->dest);
agx_device_load_to(b, vec, base, offset, attrib.format,
BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);
agx_wait(b, 0);
agx_index dests[4] = { agx_null() };
agx_emit_split(b, dests, vec, actual_comps);
agx_index one = agx_mov_imm(b, 32, fui(1.0));
agx_index zero = agx_mov_imm(b, 32, 0);
agx_index default_value[4] = { zero, zero, zero, one };
for (unsigned i = actual_comps; i < instr->num_components; ++i)
dests[i] = default_value[i];
agx_emit_collect_to(b, dest, instr->num_components, dests);
}
static void
agx_emit_load_vary_flat(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
unsigned components = instr->num_components;
assert(components >= 1 && components <= 4);
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
nir_src *offset = nir_get_io_offset_src(instr);
assert(nir_src_is_const(*offset) && "no indirects");
assert(nir_dest_bit_size(instr->dest) == 32 && "no 16-bit flat shading");
/* Get all coefficient registers up front. This ensures the driver emits a
* single vectorized binding.
*/
agx_index cf = agx_get_cf(b->shader, false, false,
sem.location + nir_src_as_uint(*offset), 0,
components);
agx_index dests[4] = { agx_null() };
for (unsigned i = 0; i < components; ++i) {
/* vec3 for each vertex; it's unclear what the first 2 channels are for */
agx_index d[3] = { agx_null() };
agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
agx_ldcf_to(b, tmp, cf, 1);
agx_emit_split(b, d, tmp, 3);
dests[i] = d[2];
/* Each component accesses a sequential coefficient register */
cf.value++;
}
agx_emit_collect_to(b, dest, components, dests);
}
static void
agx_emit_load_vary(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
ASSERTED unsigned components = instr->num_components;
nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
assert(components >= 1 && components <= 4);
/* TODO: Interpolation modes */
assert(bary != NULL);
assert(bary->intrinsic == nir_intrinsic_load_barycentric_pixel);
bool perspective =
nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE;
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
nir_src *offset = nir_get_io_offset_src(instr);
assert(nir_src_is_const(*offset) && "no indirects");
/* For perspective interpolation, we need W */
agx_index J = !perspective ? agx_zero() :
agx_get_cf(b->shader, true, false, VARYING_SLOT_POS, 3, 1);
agx_index I = agx_get_cf(b->shader, true, perspective,
sem.location + nir_src_as_uint(*offset), 0,
components);
agx_iter_to(b, dest, I, J, components, perspective);
agx_emit_cached_split(b, dest, components);
}
static agx_instr *
agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
nir_src *offset = nir_get_io_offset_src(instr);
assert(nir_src_is_const(*offset) && "todo: indirects");
unsigned imm_index = b->shader->out->varyings.vs.slots[sem.location];
assert(imm_index < ~0);
imm_index += nir_intrinsic_component(instr);
imm_index += nir_src_as_uint(*offset);
/* nir_lower_io_to_scalar */
assert(nir_intrinsic_write_mask(instr) == 0x1);
return agx_st_vary(b,
agx_immediate(imm_index),
agx_src_index(&instr->src[0]));
}
static agx_instr *
agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
unsigned loc = sem.location;
assert(sem.dual_source_blend_index == 0 && "todo: dual-source blending");
assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
unsigned rt = (loc - FRAG_RESULT_DATA0);
/* TODO: Reverse-engineer interactions with MRT */
if (b->shader->key->fs.ignore_tib_dependencies) {
assert(b->shader->nir->info.internal && "only for clear shaders");
} else if (b->shader->did_writeout) {
agx_writeout(b, 0x0004);
} else {
agx_writeout(b, 0xC200);
agx_writeout(b, 0x000C);
}
if (b->shader->nir->info.fs.uses_discard) {
/* If the shader uses discard, the sample mask must be written by the
* shader on all execution paths. If we've reached the end of the shader,
* we are therefore still active and need to write a full sample mask.
* TODO: interactions with MSAA and gl_SampleMask writes
*/
agx_sample_mask(b, agx_immediate(1));
}
b->shader->did_writeout = true;
return agx_st_tile(b, agx_src_index(&instr->src[0]),
b->shader->key->fs.tib_formats[rt],
nir_intrinsic_write_mask(instr));
}
static void
agx_emit_load_tile(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
unsigned loc = sem.location;
assert(sem.dual_source_blend_index == 0 && "dual src ld_tile is nonsense");
assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
unsigned rt = (loc - FRAG_RESULT_DATA0);
/* TODO: Reverse-engineer interactions with MRT */
assert(!b->shader->key->fs.ignore_tib_dependencies && "invalid usage");
agx_writeout(b, 0xC200);
agx_writeout(b, 0x0008);
b->shader->did_writeout = true;
b->shader->out->reads_tib = true;
unsigned nr_comps = nir_dest_num_components(instr->dest);
agx_ld_tile_to(b, dest, b->shader->key->fs.tib_formats[rt],
BITFIELD_MASK(nr_comps));
agx_emit_cached_split(b, dest, nr_comps);
}
static enum agx_format
agx_format_for_bits(unsigned bits)
{
switch (bits) {
case 8: return AGX_FORMAT_I8;
case 16: return AGX_FORMAT_I16;
case 32: return AGX_FORMAT_I32;
default: unreachable("Invalid bit size for load/store");
}
}
static void
agx_emit_load_global(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
agx_index addr = agx_src_index(&instr->src[0]);
agx_index offset = agx_immediate(0);
enum agx_format fmt = agx_format_for_bits(nir_dest_bit_size(instr->dest));
agx_device_load_to(b, dest, addr, offset, fmt,
BITFIELD_MASK(nir_dest_num_components(instr->dest)), 0);
agx_wait(b, 0);
agx_emit_cached_split(b, dest, nir_dest_num_components(instr->dest));
}
static agx_instr *
agx_emit_load_ubo(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
{
nir_src *offset = nir_get_io_offset_src(instr);
if (!nir_src_is_const(instr->src[0]))
unreachable("todo: indirect UBO access");
/* The UBO block is specified as a constant index */
uint32_t block = nir_src_as_uint(instr->src[0]);
/* Each UBO has a 64-bit = 4 x 16-bit address */
unsigned num_ubos = b->shader->nir->info.num_ubos;
unsigned base_length = (num_ubos * 4);
unsigned index = block * 4; /* 16 bit units */
/* Lookup the base address (TODO: indirection) */
agx_index base = agx_indexed_sysval(b->shader,
AGX_PUSH_UBO_BASES, AGX_SIZE_64,
index, base_length);
/* Load the data */
assert(instr->num_components <= 4);
/* Mov around the base to handle uniform restrictions; copyprop will usually
* clean that up.
*/
agx_device_load_to(b, dst, agx_mov(b, base), agx_src_index(offset),
agx_format_for_bits(nir_dest_bit_size(instr->dest)),
BITFIELD_MASK(instr->num_components), 0);
agx_wait(b, 0);
agx_emit_cached_split(b, dst, instr->num_components);
return NULL;
}
/* Preambles write directly to uniform registers, so move from uniform to GPR */
static agx_instr *
agx_emit_load_preamble(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
{
assert(nir_dest_num_components(instr->dest) == 1 && "already scalarized");
return agx_mov_to(b, dst, agx_uniform(nir_intrinsic_base(instr), dst.size));
}
static agx_instr *
agx_emit_store_preamble(agx_builder *b, nir_intrinsic_instr *instr)
{
assert(nir_src_num_components(instr->src[0]) == 1 && "already scalarized");
agx_index value = agx_src_index(&instr->src[0]);
agx_index offset = agx_immediate(nir_intrinsic_base(instr));
return agx_uniform_store(b, value, offset);
}
/*
* Emit code to generate gl_FragCoord. The xy components are calculated from
* special registers, whereas the zw components are interpolated varyings.
* Because interpolating varyings requires allocating coefficient registers that
* might not be used, we only emit code for components that are actually used.
*/
static void
agx_emit_load_frag_coord(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
{
agx_index dests[4] = { agx_null() };
u_foreach_bit(i, nir_ssa_def_components_read(&instr->dest.ssa)) {
if (i < 2) {
agx_index fp32 = agx_temp(b->shader, AGX_SIZE_32);
agx_convert_to(b, fp32, agx_immediate(AGX_CONVERT_U32_TO_F),
agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i),
AGX_ROUND_RTE);
dests[i] = agx_fadd(b, fp32, agx_immediate_f(0.5f));
} else {
agx_index cf = agx_get_cf(b->shader, true, false, VARYING_SLOT_POS, i, 1);
dests[i] = agx_iter(b, cf, agx_null(), 1, false);
}
}
agx_emit_collect_to(b, dst, 4, dests);
}
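/*
* For illustration: a fragment at integer grid position (10, 20) gets
* gl_FragCoord.xy = (10.5, 20.5) from the convert + fadd above, since pixel
* centers sit at half-integer coordinates, while .z and .w are interpolated
* from the position varying only if the shader actually reads them.
*/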
static agx_instr *
agx_blend_const(agx_builder *b, agx_index dst, unsigned comp)
{
agx_index val = agx_indexed_sysval(b->shader,
AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2);
return agx_mov_to(b, dst, val);
}
/*
* Demoting to a helper invocation is logically equivalent to zeroing the sample
* mask. Metal implements discard as such.
*
* XXX: Actually, Metal's "discard" is a demote, and what is implemented here is
* also a demote rather than a true discard. There might be a better way to
* implement this to get correct helper invocation semantics. For now, I'm
* kicking the can down the road.
*/
static agx_instr *
agx_emit_discard(agx_builder *b, nir_intrinsic_instr *instr)
{
assert(!b->shader->key->fs.ignore_tib_dependencies && "invalid usage");
agx_writeout(b, 0xC200);
agx_writeout(b, 0x0001);
b->shader->did_writeout = true;
b->shader->out->writes_sample_mask = true;
return agx_sample_mask(b, agx_immediate(0));
}
static agx_instr *
agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
{
agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
agx_dest_index(&instr->dest) : agx_null();
gl_shader_stage stage = b->shader->stage;
switch (instr->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
case nir_intrinsic_load_barycentric_at_sample:
case nir_intrinsic_load_barycentric_at_offset:
/* handled later via load_vary */
return NULL;
case nir_intrinsic_load_interpolated_input:
assert(stage == MESA_SHADER_FRAGMENT);
agx_emit_load_vary(b, dst, instr);
return NULL;
case nir_intrinsic_load_input:
if (stage == MESA_SHADER_FRAGMENT)
agx_emit_load_vary_flat(b, dst, instr);
else if (stage == MESA_SHADER_VERTEX)
agx_emit_load_attr(b, dst, instr);
else
unreachable("Unsupported shader stage");
return NULL;
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
agx_emit_load_global(b, dst, instr);
return NULL;
case nir_intrinsic_store_output:
if (stage == MESA_SHADER_FRAGMENT)
return agx_emit_fragment_out(b, instr);
else if (stage == MESA_SHADER_VERTEX)
return agx_emit_store_vary(b, instr);
else
unreachable("Unsupported shader stage");
case nir_intrinsic_load_output:
assert(stage == MESA_SHADER_FRAGMENT);
agx_emit_load_tile(b, dst, instr);
return NULL;
case nir_intrinsic_load_ubo:
return agx_emit_load_ubo(b, dst, instr);
case nir_intrinsic_load_frag_coord:
agx_emit_load_frag_coord(b, dst, instr);
return NULL;
case nir_intrinsic_discard:
return agx_emit_discard(b, instr);
case nir_intrinsic_load_back_face_agx:
return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
case nir_intrinsic_load_texture_base_agx:
return agx_mov_to(b, dst, agx_indexed_sysval(b->shader,
AGX_PUSH_TEXTURE_BASE, AGX_SIZE_64, 0, 4));
case nir_intrinsic_load_vertex_id:
return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
case nir_intrinsic_load_instance_id:
return agx_mov_to(b, dst, agx_abs(agx_instance_id(b)));
case nir_intrinsic_load_preamble:
return agx_emit_load_preamble(b, dst, instr);
case nir_intrinsic_store_preamble:
return agx_emit_store_preamble(b, instr);
case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2);
case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3);
default:
fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
unreachable("Unhandled intrinsic");
}
}
static agx_index
agx_alu_src_index(agx_builder *b, nir_alu_src src)
{
/* Check well-formedness of the input NIR */
ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
unsigned comps = nir_src_num_components(src.src);
unsigned channel = src.swizzle[0];
assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64);
assert(!(src.negate || src.abs));
assert(channel < comps);
agx_index idx = agx_src_index(&src.src);
/* We only deal with scalars, extract a single scalar if needed */
if (comps > 1)
return agx_emit_extract(b, idx, channel);
else
return idx;
}
static agx_instr *
agx_emit_alu_bool(agx_builder *b, nir_op op,
agx_index dst, agx_index s0, agx_index s1, agx_index s2)
{
/* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0.
* This will give the optimizer flexibility. */
agx_index f = agx_immediate(0);
agx_index t = agx_immediate(0x1);
switch (op) {
case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ);
case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT);
case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE);
case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ);
case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ);
case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ);
case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT);
case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT);
case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT);
case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT);
case nir_op_mov: return agx_mov_to(b, dst, s0);
case nir_op_iand: return agx_and_to(b, dst, s0, s1);
case nir_op_ior: return agx_or_to(b, dst, s0, s1);
case nir_op_ixor: return agx_xor_to(b, dst, s0, s1);
case nir_op_inot: return agx_xor_to(b, dst, s0, t);
case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ);
case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
case nir_op_bcsel:
return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ);
default:
fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
unreachable("Unhandled boolean ALU instruction");
}
}
static agx_instr *
agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
{
unsigned srcs = nir_op_infos[instr->op].num_inputs;
unsigned sz = nir_dest_bit_size(instr->dest.dest);
unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest);
assert(comps == 1 || nir_op_is_vec(instr->op));
assert(sz == 1 || sz == 16 || sz == 32 || sz == 64);
agx_index dst = agx_dest_index(&instr->dest.dest);
agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
/* 1-bit bools are a bit special, only handle with select ops */
if (sz == 1)
return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2);
#define UNOP(nop, aop) \
case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0);
#define BINOP(nop, aop) \
case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1);
#define TRIOP(nop, aop) \
case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2);
switch (instr->op) {
BINOP(fadd, fadd);
BINOP(fmul, fmul);
TRIOP(ffma, fma);
UNOP(f2f16, fmov);
UNOP(f2f32, fmov);
UNOP(fround_even, roundeven);
UNOP(ftrunc, trunc);
UNOP(ffloor, floor);
UNOP(fceil, ceil);
UNOP(frcp, rcp);
UNOP(frsq, rsqrt);
UNOP(flog2, log2);
UNOP(fexp2, exp2);
UNOP(fddx, dfdx);
UNOP(fddx_coarse, dfdx);
UNOP(fddx_fine, dfdx);
UNOP(fddy, dfdy);
UNOP(fddy_coarse, dfdy);
UNOP(fddy_fine, dfdy);
UNOP(mov, mov);
UNOP(u2u16, mov);
UNOP(u2u32, mov);
UNOP(inot, not);
BINOP(iand, and);
BINOP(ior, or);
BINOP(ixor, xor);
case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1));
case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0));
case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0));
case nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN);
case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN);
case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0);
case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0);
case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0);
case nir_op_umul_high: return agx_umul_high_to(b, dst, s0, s1);
case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0);
case nir_op_ushr: return agx_ushr_to(b, dst, s0, s1);
case nir_op_ishr: return agx_asr_to(b, dst, s0, s1);
case nir_op_bcsel:
return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ);
case nir_op_b2i32:
case nir_op_b2i16:
return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ);
case nir_op_b2f16:
case nir_op_b2f32:
{
/* At this point, boolean is just zero/nonzero, so compare with zero */
agx_index one = (sz == 16) ?
agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) :
agx_mov_imm(b, 32, fui(1.0));
agx_index zero = agx_zero();
return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ);
}
case nir_op_i2i32:
{
if (s0.size != AGX_SIZE_16)
unreachable("todo: more conversions");
return agx_iadd_to(b, dst, s0, agx_zero(), 0);
}
case nir_op_i2i16:
{
if (s0.size != AGX_SIZE_32)
unreachable("todo: more conversions");
return agx_iadd_to(b, dst, s0, agx_zero(), 0);
}
case nir_op_iadd_sat:
{
agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
I->saturate = true;
return I;
}
case nir_op_isub_sat:
{
agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
I->saturate = true;
return I;
}
case nir_op_uadd_sat:
{
agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
I->saturate = true;
return I;
}
case nir_op_usub_sat:
{
agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
I->saturate = true;
return I;
}
case nir_op_fsat:
{
agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
I->saturate = true;
return I;
}
case nir_op_fsin_agx:
{
agx_index fixup = agx_sin_pt_1(b, s0);
agx_index sinc = agx_sin_pt_2(b, fixup);
return agx_fmul_to(b, dst, sinc, fixup);
}
case nir_op_f2i16:
return agx_convert_to(b, dst,
agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ);
case nir_op_f2i32:
return agx_convert_to(b, dst,
agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ);
case nir_op_f2u16:
return agx_convert_to(b, dst,
agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ);
case nir_op_f2u32:
return agx_convert_to(b, dst,
agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ);
case nir_op_u2f16:
case nir_op_u2f32:
{
if (src_sz == 64)
unreachable("64-bit conversions unimplemented");
enum agx_convert mode =
(src_sz == 32) ? AGX_CONVERT_U32_TO_F :
(src_sz == 16) ? AGX_CONVERT_U16_TO_F :
AGX_CONVERT_U8_TO_F;
return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
}
case nir_op_i2f16:
case nir_op_i2f32:
{
if (src_sz == 64)
unreachable("64-bit conversions unimplemented");
enum agx_convert mode =
(src_sz == 32) ? AGX_CONVERT_S32_TO_F :
(src_sz == 16) ? AGX_CONVERT_S16_TO_F :
AGX_CONVERT_S8_TO_F;
return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
}
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
{
agx_index idx[] = { s0, s1, s2, s3 };
return agx_emit_collect_to(b, dst, srcs, idx);
}
case nir_op_vec8:
case nir_op_vec16:
unreachable("should've been lowered");
default:
fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
unreachable("Unhandled ALU instruction");
}
}
static enum agx_dim
agx_tex_dim(enum glsl_sampler_dim dim, bool array)
{
switch (dim) {
case GLSL_SAMPLER_DIM_1D:
case GLSL_SAMPLER_DIM_BUF:
return array ? AGX_DIM_1D_ARRAY : AGX_DIM_1D;
case GLSL_SAMPLER_DIM_2D:
case GLSL_SAMPLER_DIM_RECT:
case GLSL_SAMPLER_DIM_EXTERNAL:
return array ? AGX_DIM_2D_ARRAY : AGX_DIM_2D;
case GLSL_SAMPLER_DIM_MS:
assert(!array && "multisampled arrays unsupported");
return AGX_DIM_2D_MS;
case GLSL_SAMPLER_DIM_3D:
assert(!array && "3D arrays unsupported");
return AGX_DIM_3D;
case GLSL_SAMPLER_DIM_CUBE:
return array ? AGX_DIM_CUBE_ARRAY : AGX_DIM_CUBE;
default:
unreachable("Invalid sampler dim\n");
}
}
static enum agx_lod_mode
agx_lod_mode_for_nir(nir_texop op)
{
switch (op) {
case nir_texop_tex: return AGX_LOD_MODE_AUTO_LOD;
case nir_texop_txb: return AGX_LOD_MODE_AUTO_LOD_BIAS;
case nir_texop_txd: return AGX_LOD_MODE_LOD_GRAD;
case nir_texop_txl: return AGX_LOD_MODE_LOD_MIN;
case nir_texop_txf: return AGX_LOD_MODE_LOD_MIN;
default: unreachable("Unhandled texture op");
}
}
static void
agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
{
switch (instr->op) {
case nir_texop_tex:
case nir_texop_txf:
case nir_texop_txl:
case nir_texop_txb:
case nir_texop_txd:
break;
default:
unreachable("Unhandled texture op");
}
agx_index coords = agx_null(),
texture = agx_immediate(instr->texture_index),
sampler = agx_immediate(instr->sampler_index),
lod = agx_immediate(0),
compare = agx_null(),
packed_offset = agx_null();
bool txf = instr->op == nir_texop_txf;
for (unsigned i = 0; i < instr->num_srcs; ++i) {
agx_index index = agx_src_index(&instr->src[i].src);
switch (instr->src[i].src_type) {
case nir_tex_src_coord:
case nir_tex_src_backend1:
coords = index;
break;
case nir_tex_src_lod:
case nir_tex_src_bias:
lod = index;
break;
case nir_tex_src_comparator:
assert(index.size == AGX_SIZE_32);
compare = index;
break;
case nir_tex_src_offset:
{
assert(instr->src[i].src.is_ssa);
nir_ssa_def *def = instr->src[i].src.ssa;
uint32_t packed = 0;
for (unsigned c = 0; c < def->num_components; ++c) {
nir_ssa_scalar s = nir_ssa_scalar_resolved(def, c);
assert(nir_ssa_scalar_is_const(s) && "no nonconstant offsets");
int32_t val = nir_ssa_scalar_as_uint(s);
assert((val >= -8 && val <= 7) && "out of bounds offset");
packed |= (val & 0xF) << (4 * c);
}
packed_offset = agx_mov_imm(b, 32, packed);
break;
}
case nir_tex_src_ddx:
{
int y_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
assert(y_idx >= 0 && "we only handle gradients");
unsigned n = nir_tex_instr_src_size(instr, y_idx);
assert((n == 2 || n == 3) && "other sizes not supported");
agx_index index2 = agx_src_index(&instr->src[y_idx].src);
/* We explicitly don't go through the split cache for this */
lod = agx_temp(b->shader, AGX_SIZE_32);
agx_instr *I = agx_collect_to(b, lod, 2 * n);
for (unsigned i = 0; i < n; ++i) {
I->src[(2 * i) + 0] = agx_emit_extract(b, index, i);
I->src[(2 * i) + 1] = agx_emit_extract(b, index2, i);
}
break;
}
case nir_tex_src_ddy:
/* handled above */
break;
case nir_tex_src_ms_index:
case nir_tex_src_texture_offset:
case nir_tex_src_sampler_offset:
default:
unreachable("todo");
}
}
agx_index dst = agx_dest_index(&instr->dest);
/* Pack shadow reference value (compare) and packed offset together */
agx_index compare_offset = agx_null();
if (!agx_is_null(compare) && !agx_is_null(packed_offset))
compare_offset = agx_vec2(b, compare, packed_offset);
else if (!agx_is_null(packed_offset))
compare_offset = packed_offset;
else if (!agx_is_null(compare))
compare_offset = compare;
agx_instr *I = agx_texture_sample_to(b, dst, coords, lod, texture, sampler,
compare_offset,
agx_tex_dim(instr->sampler_dim, instr->is_array),
agx_lod_mode_for_nir(instr->op),
0xF, /* TODO: wrmask */
0, !agx_is_null(packed_offset), !agx_is_null(compare));
if (txf)
I->op = AGX_OPCODE_TEXTURE_LOAD;
agx_wait(b, 0);
agx_emit_cached_split(b, dst, 4);
}
/*
* Mark the logical end of the current block by emitting a p_logical_end marker.
* Note if an unconditional jump is emitted (for instance, to break out of a
* loop from inside an if), the block has already reached its logical end so we
* don't re-emit p_logical_end. The validator checks this, and correct register
* allocation depends on it.
*/
static void
agx_emit_logical_end(agx_builder *b)
{
if (!b->shader->current_block->unconditional_jumps)
agx_logical_end(b);
}
/*
* NIR loops are treated as a pair of AGX loops:
*
* do {
* do {
* ...
* } while (0);
* } while (cond);
*
* By manipulating the nesting counter, we may break out of nested loops, so
* under this model, both break and continue may be implemented as breaks, where
* break breaks out of the outer loop (2 layers) and continue breaks out of the
* inner loop (1 layer).
*
* After manipulating the nesting counter directly, pop_exec #0 must be used to
* flush the update to the execution mask.
*/
static void
agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
{
agx_context *ctx = b->shader;
assert(instr->type == nir_jump_break || instr->type == nir_jump_continue);
/* Break out of either one or two loops */
unsigned nestings = b->shader->loop_nesting;
if (instr->type == nir_jump_continue) {
nestings += 1;
agx_block_add_successor(ctx->current_block, ctx->continue_block);
} else if (instr->type == nir_jump_break) {
nestings += 2;
agx_block_add_successor(ctx->current_block, ctx->break_block);
}
/* Update the counter and flush */
agx_nest(b, agx_immediate(nestings));
/* Jumps must come at the end of a block */
agx_emit_logical_end(b);
agx_pop_exec(b, 0);
ctx->current_block->unconditional_jumps = true;
}
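/*
* Illustrative lowering, assuming no enclosing divergent control flow so
* loop_nesting is 0 at the jump. A continue emits roughly:
*
*    nest #1        ; leave only the inner single-iteration loop
*    pop_exec #0    ; flush the new nesting to the execution mask
*
* while a break emits:
*
*    nest #2        ; leave both loops
*    pop_exec #0
*/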
static void
agx_emit_phi(agx_builder *b, nir_phi_instr *instr)
{
agx_instr *I = agx_phi_to(b, agx_dest_index(&instr->dest),
exec_list_length(&instr->srcs));
/* Deferred */
I->phi = instr;
}
/* Look up the AGX block corresponding to a given NIR block. Used when
* translating phi nodes after emitting all blocks.
*/
static agx_block *
agx_from_nir_block(agx_context *ctx, nir_block *block)
{
return ctx->indexed_nir_blocks[block->index];
}
static void
agx_emit_phi_deferred(agx_context *ctx, agx_block *block, agx_instr *I)
{
nir_phi_instr *phi = I->phi;
/* Guaranteed by lower_phis_to_scalar */
assert(phi->dest.ssa.num_components == 1);
nir_foreach_phi_src(src, phi) {
agx_block *pred = agx_from_nir_block(ctx, src->pred);
unsigned i = agx_predecessor_index(block, pred);
assert(i < I->nr_srcs);
I->src[i] = agx_src_index(&src->src);
}
}
static void
agx_emit_phis_deferred(agx_context *ctx)
{
agx_foreach_block(ctx, block) {
agx_foreach_phi_in_block(block, I)
agx_emit_phi_deferred(ctx, block, I);
}
}
static void
agx_emit_instr(agx_builder *b, struct nir_instr *instr)
{
switch (instr->type) {
case nir_instr_type_load_const:
agx_emit_load_const(b, nir_instr_as_load_const(instr));
break;
case nir_instr_type_intrinsic:
agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
break;
case nir_instr_type_alu:
agx_emit_alu(b, nir_instr_as_alu(instr));
break;
case nir_instr_type_tex:
agx_emit_tex(b, nir_instr_as_tex(instr));
break;
case nir_instr_type_jump:
agx_emit_jump(b, nir_instr_as_jump(instr));
break;
case nir_instr_type_phi:
agx_emit_phi(b, nir_instr_as_phi(instr));
break;
default:
unreachable("should've been lowered");
}
}
static agx_block *
agx_create_block(agx_context *ctx)
{
agx_block *blk = rzalloc(ctx, agx_block);
util_dynarray_init(&blk->predecessors, blk);
return blk;
}
static agx_block *
emit_block(agx_context *ctx, nir_block *block)
{
if (ctx->after_block) {
ctx->current_block = ctx->after_block;
ctx->after_block = NULL;
} else {
ctx->current_block = agx_create_block(ctx);
}
agx_block *blk = ctx->current_block;
list_addtail(&blk->link, &ctx->blocks);
list_inithead(&blk->instructions);
ctx->indexed_nir_blocks[block->index] = blk;
agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
nir_foreach_instr(instr, block) {
agx_emit_instr(&_b, instr);
}
return blk;
}
static agx_block *
emit_cf_list(agx_context *ctx, struct exec_list *list);
/* Emit if-else as
*
* if_icmp cond != 0
* ...
* else_icmp cond == 0
* ...
* pop_exec
*
* If the else is empty, we can omit the else_icmp. This happens elsewhere, as
* an empty else block can become nonempty after RA due to phi lowering. This is
* not usually optimal, but it's a start.
*/
static void
emit_if(agx_context *ctx, nir_if *nif)
{
agx_block *first_block = ctx->current_block;
agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
agx_index cond = agx_src_index(&nif->condition);
agx_emit_logical_end(&_b);
agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true);
ctx->loop_nesting++;
/* Emit the two subblocks. */
agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
agx_block *end_then = ctx->current_block;
_b.cursor = agx_after_block(ctx->current_block);
agx_emit_logical_end(&_b);
agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false);
agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
agx_block *end_else = ctx->current_block;
ctx->after_block = agx_create_block(ctx);
agx_block_add_successor(first_block, if_block);
agx_block_add_successor(first_block, else_block);
agx_block_add_successor(end_then, ctx->after_block);
agx_block_add_successor(end_else, ctx->after_block);
_b.cursor = agx_after_block(ctx->current_block);
agx_emit_logical_end(&_b);
agx_pop_exec(&_b, 1);
ctx->loop_nesting--;
}
static void
emit_loop(agx_context *ctx, nir_loop *nloop)
{
/* We only track nesting within the innermost loop, so push and reset */
unsigned pushed_nesting = ctx->loop_nesting;
ctx->loop_nesting = 0;
agx_block *popped_break = ctx->break_block;
agx_block *popped_continue = ctx->continue_block;
ctx->break_block = agx_create_block(ctx);
ctx->continue_block = agx_create_block(ctx);
/* Make room for break/continue nesting (TODO: skip if no divergent CF) */
agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
agx_emit_logical_end(&_b);
agx_push_exec(&_b, 2);
/* Fallthrough to body */
agx_block_add_successor(ctx->current_block, ctx->continue_block);
/* Emit the body */
ctx->after_block = ctx->continue_block;
agx_block *start_block = emit_cf_list(ctx, &nloop->body);
/* Fix up the nesting counter via an always true while_icmp, and branch back
* to start of loop if any lanes are active */
_b.cursor = agx_after_block(ctx->current_block);
agx_emit_logical_end(&_b);
agx_while_icmp(&_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false);
agx_jmp_exec_any(&_b, start_block);
agx_pop_exec(&_b, 2);
agx_block_add_successor(ctx->current_block, ctx->continue_block);
/* Pop off */
ctx->after_block = ctx->break_block;
ctx->break_block = popped_break;
ctx->continue_block = popped_continue;
/* Update shader-db stats */
++ctx->loop_count;
/* All nested control flow must have finished */
assert(ctx->loop_nesting == 0);
/* Restore loop nesting (we might be inside an if inside an outer loop) */
ctx->loop_nesting = pushed_nesting;
}
/* Before the first control flow structure, the nesting counter needs to be
* zeroed for correct operation. This only happens at most once, since by
* definition this occurs at the end of the first block, which dominates the
* rest of the program. */
static void
emit_first_cf(agx_context *ctx)
{
if (ctx->any_cf)
return;
agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
agx_nest(&_b, agx_immediate(0));
ctx->any_cf = true;
}
static agx_block *
emit_cf_list(agx_context *ctx, struct exec_list *list)
{
agx_block *start_block = NULL;
foreach_list_typed(nir_cf_node, node, node, list) {
switch (node->type) {
case nir_cf_node_block: {
agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
if (!start_block)
start_block = block;
break;
}
case nir_cf_node_if:
emit_first_cf(ctx);
emit_if(ctx, nir_cf_node_as_if(node));
break;
case nir_cf_node_loop:
emit_first_cf(ctx);
emit_loop(ctx, nir_cf_node_as_loop(node));
break;
default:
unreachable("Unknown control flow");
}
}
return start_block;
}
static void
agx_set_st_vary_final(agx_context *ctx)
{
agx_foreach_instr_global_rev(ctx, I) {
if (I->op == AGX_OPCODE_ST_VARY) {
I->last = true;
return;
}
}
}
static int
agx_dump_stats(agx_context *ctx, unsigned size, char **out)
{
unsigned nr_ins = 0;
/* Count instructions */
agx_foreach_instr_global(ctx, I)
nr_ins++;
/* TODO: Pipe through occupancy */
unsigned nr_threads = 1;
return asprintf(out,
"%s shader: %u inst, %u bytes, %u halfregs, %u threads, "
"%u loops, %u:%u spills:fills",
gl_shader_stage_name(ctx->stage),
nr_ins, size, ctx->max_reg, nr_threads, ctx->loop_count,
ctx->spills, ctx->fills);
}
static int
glsl_type_size(const struct glsl_type *type, bool bindless)
{
return glsl_count_attribute_slots(type, false);
}
static bool
agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
{
if (instr->type != nir_instr_type_alu)
return false;
nir_alu_instr *alu = nir_instr_as_alu(instr);
return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
}
/* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
* heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
* turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
* fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
* fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
* need to change units from radians to quadrants modulo turns. Cosine is
* implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
*/
static nir_ssa_def *
agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
{
nir_alu_instr *alu = nir_instr_as_alu(instr);
nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1);
nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
if (alu->op == nir_op_fcos)
turns = nir_fadd_imm(b, turns, 0.25f);
nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
return nir_fsin_agx(b, quadrants);
}
static bool
agx_lower_sincos(nir_shader *shader)
{
return nir_shader_lower_instructions(shader,
agx_lower_sincos_filter, agx_lower_sincos_impl, NULL);
}
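/*
* Worked example of the unit change (illustrative): for x = pi/2, turns =
* x / (2*pi) = 0.25, quadrants = fract(0.25) * 4 = 1.0, and fsin_agx(1.0) =
* sin(pi/2) = 1.0 as expected. For cosine, the extra 0.25 turn shifts the
* argument by pi/2 before the same evaluation, so cos(pi/2) maps to sin(pi) = 0.
*/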
static bool
agx_lower_front_face(struct nir_builder *b,
nir_instr *instr, UNUSED void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_load_front_face)
return false;
assert(intr->dest.is_ssa);
nir_ssa_def *def = &intr->dest.ssa;
assert(def->bit_size == 1);
b->cursor = nir_before_instr(&intr->instr);
nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
return true;
}
static bool
agx_lower_aligned_offsets(struct nir_builder *b,
nir_instr *instr, UNUSED void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_load_ubo)
return false;
b->cursor = nir_before_instr(&intr->instr);
unsigned bytes = nir_dest_bit_size(intr->dest) / 8;
assert(util_is_power_of_two_or_zero(bytes) && bytes != 0);
nir_src *offset = &intr->src[1];
unsigned shift = util_logbase2(bytes);
nir_ssa_def *old = nir_ssa_for_src(b, *offset, 1);
nir_ssa_def *new = nir_ishr_imm(b, old, shift);
nir_instr_rewrite_src_ssa(instr, offset, new);
return true;
}
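/*
* Illustrative effect of the pass above: a 32-bit load_ubo at byte offset 64
* has its offset rewritten to 64 >> log2(4) = 16, i.e. the backend consumes
* UBO offsets in units of the access size rather than in bytes.
*/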
static void
agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
{
bool progress;
nir_lower_idiv_options idiv_options = {
.allow_fp16 = true,
};
NIR_PASS_V(nir, nir_lower_regs_to_ssa);
NIR_PASS_V(nir, nir_lower_int64);
NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
NIR_PASS_V(nir, agx_lower_sincos);
NIR_PASS_V(nir, nir_shader_instructions_pass,
agx_lower_front_face,
nir_metadata_block_index | nir_metadata_dominance, NULL);
do {
progress = false;
NIR_PASS(progress, nir, nir_lower_var_copies);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_remove_phis);
NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_dead_cf);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
NIR_PASS(progress, nir, nir_opt_algebraic);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_undef);
NIR_PASS(progress, nir, nir_lower_undef_to_zero);
NIR_PASS(progress, nir, nir_opt_loop_unroll);
} while (progress);
NIR_PASS_V(nir, agx_nir_opt_preamble, preamble_size);
NIR_PASS_V(nir, nir_opt_algebraic_late);
NIR_PASS_V(nir, nir_opt_constant_folding);
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_opt_cse);
NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
/* Cleanup optimizations */
nir_move_options move_all =
nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
NIR_PASS_V(nir, nir_opt_sink, move_all);
NIR_PASS_V(nir, nir_opt_move, move_all);
NIR_PASS_V(nir, nir_lower_phis_to_scalar, true);
}
/* ABI: position first, then user, then psiz */
static void
agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings)
{
unsigned base = 0;
/* Initialize to "nothing is written" */
for (unsigned i = 0; i < ARRAY_SIZE(varyings->slots); ++i)
varyings->slots[i] = ~0;
assert(nir->info.outputs_written & VARYING_BIT_POS);
varyings->slots[VARYING_SLOT_POS] = base;
base += 4;
nir_foreach_shader_out_variable(var, nir) {
unsigned loc = var->data.location;
if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
continue;
varyings->slots[loc] = base;
base += 4;
}
/* TODO: Link FP16 varyings */
varyings->base_index_fp16 = base;
if (nir->info.outputs_written & VARYING_BIT_PSIZ) {
varyings->slots[VARYING_SLOT_PSIZ] = base;
base += 1;
}
/* All varyings linked now */
varyings->nr_index = base;
}
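/*
* Illustrative layout: a vertex shader writing POS, VAR0 and PSIZ ends up with
* slots[POS] = 0, slots[VAR0] = 4, base_index_fp16 = 8, slots[PSIZ] = 8 and
* nr_index = 9, matching the "position first, then user, then psiz" ABI above.
*/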
/*
* Build a bit mask of varyings (by location) that are flatshaded. This
* information is needed by lower_mediump_io.
*/
static uint64_t
agx_flat_varying_mask(nir_shader *nir)
{
uint64_t mask = 0;
assert(nir->info.stage == MESA_SHADER_FRAGMENT);
nir_foreach_shader_in_variable(var, nir) {
if (var->data.interpolation == INTERP_MODE_FLAT)
mask |= BITFIELD64_BIT(var->data.location);
}
return mask;
}
static bool
agx_should_dump(nir_shader *nir, unsigned agx_dbg_bit)
{
return (agx_debug & agx_dbg_bit) &&
!(nir->info.internal && !(agx_debug & AGX_DBG_INTERNAL));
}
static unsigned
agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
struct agx_shader_key *key,
struct util_debug_callback *debug,
struct util_dynarray *binary,
struct agx_shader_info *out)
{
nir_index_blocks(impl);
agx_context *ctx = rzalloc(NULL, agx_context);
ctx->nir = nir;
ctx->out = out;
ctx->key = key;
ctx->stage = nir->info.stage;
ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
ctx->indexed_nir_blocks = rzalloc_array(ctx, agx_block *, impl->num_blocks);
list_inithead(&ctx->blocks);
ctx->alloc = impl->ssa_alloc;
emit_cf_list(ctx, &impl->body);
agx_emit_phis_deferred(ctx);
/* Stop the main shader or preamble shader after the exit block. For real
* functions, we would return here.
*/
agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
agx_stop(&_b);
/* Index blocks now that we're done emitting so the order is consistent */
agx_foreach_block(ctx, block)
block->index = ctx->num_blocks++;
agx_validate(ctx, "IR translation");
if (agx_should_dump(nir, AGX_DBG_SHADERS))
agx_print_shader(ctx, stdout);
if (likely(!(agx_debug & AGX_DBG_NOOPT))) {
agx_optimizer(ctx);
agx_dce(ctx);
agx_validate(ctx, "Optimization");
if (agx_should_dump(nir, AGX_DBG_SHADERS))
agx_print_shader(ctx, stdout);
}
agx_ra(ctx);
agx_lower_64bit_postra(ctx);
if (ctx->stage == MESA_SHADER_VERTEX)
agx_set_st_vary_final(ctx);
if (agx_should_dump(nir, AGX_DBG_SHADERS))
agx_print_shader(ctx, stdout);
agx_lower_pseudo(ctx);
/* Pad binary */
if (binary->size % AGX_CODE_ALIGN) {
unsigned ngrow = AGX_CODE_ALIGN - (binary->size % AGX_CODE_ALIGN);
memset(util_dynarray_grow_bytes(binary, ngrow, 1), 0, ngrow);
}
unsigned offset = binary->size;
assert((offset % AGX_CODE_ALIGN) == 0);
agx_pack_binary(ctx, binary);
unsigned nr_gprs = ctx->max_reg + 1;
if (impl->function->is_preamble)
out->nr_preamble_gprs = nr_gprs;
else
out->nr_gprs = nr_gprs;
/* Don't dump statistics for preambles, since they're not worth optimizing */
if (!impl->function->is_preamble) {
char *stats;
int ret = agx_dump_stats(ctx, binary->size, &stats);
if (ret >= 0) {
if (agx_should_dump(nir, AGX_DBG_SHADERDB))
fprintf(stderr, "SHADER-DB: %s - %s\n", nir->info.label ?: "", stats);
if (debug)
util_debug_message(debug, SHADER_INFO, "%s", stats);
free(stats);
}
}
ralloc_free(ctx);
return offset;
}
void
agx_compile_shader_nir(nir_shader *nir,
struct agx_shader_key *key,
struct util_debug_callback *debug,
struct util_dynarray *binary,
struct agx_shader_info *out)
{
agx_debug = debug_get_option_agx_debug();
memset(out, 0, sizeof *out);
if (nir->info.stage == MESA_SHADER_VERTEX) {
out->writes_psiz = nir->info.outputs_written &
BITFIELD_BIT(VARYING_SLOT_PSIZ);
} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
out->no_colour_output = !(nir->info.outputs_written >> FRAG_RESULT_DATA0);
}
NIR_PASS_V(nir, nir_lower_vars_to_ssa);
/* Lower large arrays to scratch and small arrays to csel */
NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
glsl_get_natural_size_align_bytes);
NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
NIR_PASS_V(nir, nir_split_var_copies);
NIR_PASS_V(nir, nir_lower_global_vars_to_local);
NIR_PASS_V(nir, nir_lower_var_copies);
NIR_PASS_V(nir, nir_lower_vars_to_ssa);
NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
glsl_type_size, 0);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
/* Interpolate varyings at fp16 and write to the tilebuffer at fp16. As an
* exception, interpolate flat-shaded varyings at fp32. This works around a
* hardware limitation. The resulting code (with an extra f2f16 at the end
* if needed) matches what Metal produces.
*/
NIR_PASS_V(nir, nir_lower_mediump_io,
nir_var_shader_in | nir_var_shader_out,
~agx_flat_varying_mask(nir), false);
}
NIR_PASS_V(nir, nir_shader_instructions_pass,
agx_lower_aligned_offsets,
nir_metadata_block_index | nir_metadata_dominance, NULL);
NIR_PASS_V(nir, nir_lower_ssbo);
/* Varying output is scalar, other I/O is vector */
if (nir->info.stage == MESA_SHADER_VERTEX) {
NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
}
nir_lower_tex_options lower_tex_options = {
.lower_txp = ~0,
.lower_invalid_implicit_lod = true,
/* XXX: Metal seems to handle it just like 3D txd, so why doesn't it work?
* TODO: Stop using this lowering
*/
.lower_txd_cube_map = true,
};
nir_tex_src_type_constraints tex_constraints = {
[nir_tex_src_lod] = { true, 16 },
[nir_tex_src_bias] = { true, 16 },
};
NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
NIR_PASS_V(nir, agx_nir_lower_array_texture);
NIR_PASS_V(nir, agx_lower_resinfo);
NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints);
agx_optimize_nir(nir, &out->push_count);
/* Implement conditional discard with real control flow like Metal */
NIR_PASS_V(nir, nir_lower_discard_if, (nir_lower_discard_if_to_cf |
nir_lower_demote_if_to_cf |
nir_lower_terminate_if_to_cf));
/* Must be last since NIR passes can remap driver_location freely */
if (nir->info.stage == MESA_SHADER_VERTEX)
agx_remap_varyings_vs(nir, &out->varyings.vs);
if (agx_should_dump(nir, AGX_DBG_SHADERS))
nir_print_shader(nir, stdout);
nir_foreach_function(func, nir) {
if (!func->impl) continue;
unsigned offset = agx_compile_function_nir(nir, func->impl, key, debug, binary, out);
if (func->is_preamble) {
out->preamble_offset = offset;
out->has_preamble = true;
} else if (func->is_entrypoint) {
out->main_offset = offset;
} else {
unreachable("General functions not yet supported");
}
}
}