/*
* Copyright (C) 2020 Collabora Ltd.
* Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors (Collabora):
* Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
*/
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_schedule.h"
#include "util/u_debug.h"
#include "disassemble.h"
#include "valhall/va_compiler.h"
#include "valhall/disassemble.h"
#include "bifrost_compile.h"
#include "compiler.h"
#include "valhall/va_compiler.h"
#include "bi_quirks.h"
#include "bi_builder.h"
#include "bifrost_nir.h"
static const struct debug_named_value bifrost_debug_options[] = {
{"msgs", BIFROST_DBG_MSGS, "Print debug messages"},
{"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"},
{"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"},
{"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"},
{"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"},
{"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"},
{"nopsched", BIFROST_DBG_NOPSCHED, "Disable scheduling for pressure"},
{"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"},
{"novalidate",BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
{"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"},
{"noidvs", BIFROST_DBG_NOIDVS, "Disable IDVS"},
{"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"},
{"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"},
{"spill", BIFROST_DBG_SPILL, "Test register spilling"},
DEBUG_NAMED_VALUE_END
};
DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)
/* How many bytes are prefetched by the Bifrost shader core. Starting from the
* final clause of the shader, this range must contain valid instructions or
* zeroes. */
#define BIFROST_SHADER_PREFETCH 128
int bifrost_debug = 0;
#define DBG(fmt, ...) \
do { if (bifrost_debug & BIFROST_DBG_MSGS) \
fprintf(stderr, "%s:%d: "fmt, \
__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
static bi_index
bi_preload(bi_builder *b, unsigned reg)
{
if (bi_is_null(b->shader->preloaded[reg])) {
/* Insert at the beginning of the shader */
bi_builder b_ = *b;
b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
/* Cache the result */
b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
}
return b->shader->preloaded[reg];
}
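/* By ISA convention the coverage mask is preloaded in r60 (see the ATEST and
* blend paths below). Cache it like any other preload so it is only moved out
* of the register once. */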
static bi_index
bi_coverage(bi_builder *b)
{
if (bi_is_null(b->shader->coverage))
b->shader->coverage = bi_preload(b, 60);
return b->shader->coverage;
}
/*
* Vertex ID and Instance ID are preloaded registers. The registers they are
* preloaded into changed from Bifrost to Valhall. Provide helpers that smooth
* over the architectural difference.
*/
static inline bi_index
bi_vertex_id(bi_builder *b)
{
return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
}
static inline bi_index
bi_instance_id(bi_builder *b)
{
return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
}
static void
bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
{
bi_instr *branch = bi_jump(b, bi_zero());
switch (instr->type) {
case nir_jump_break:
branch->branch_target = b->shader->break_block;
break;
case nir_jump_continue:
branch->branch_target = b->shader->continue_block;
break;
default:
unreachable("Unhandled jump type");
}
bi_block_add_successor(b->shader->current_block, branch->branch_target);
b->shader->current_block->unconditional_jumps = true;
}
/* Builds a 64-bit hash table key for an index */
static uint64_t
bi_index_to_key(bi_index idx)
{
static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
uint64_t key = 0;
memcpy(&key, &idx, sizeof(idx));
return key;
}
/*
* Extract a single channel out of a vector source. We split vectors with SPLIT
* so we can use the split components directly, without emitting an extract.
* This has advantages for RA, as the split can usually be optimized away.
*/
static bi_index
bi_extract(bi_builder *b, bi_index vec, unsigned channel)
{
bi_index *components =
_mesa_hash_table_u64_search(b->shader->allocated_vec,
bi_index_to_key(vec));
/* No extract needed for scalars.
*
* This is a bit imprecise, but actual bugs (missing splits for vectors)
* should be caught by the following assertion. It is too difficult to
* ensure bi_extract is only called for real vectors.
*/
if (components == NULL && channel == 0)
return vec;
assert(components != NULL && "missing bi_cache_collect()");
return components[channel];
}
static void
bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
{
/* Lifetime of a hash table entry has to be at least as long as the table */
bi_index *channels = ralloc_array(b->shader, bi_index, n);
memcpy(channels, s, sizeof(bi_index) * n);
_mesa_hash_table_u64_insert(b->shader->allocated_vec,
bi_index_to_key(dst), channels);
}
/*
* Splits an n-component vector (vec) into n scalar destinations (dests) using a
* split pseudo-instruction.
*
* Pre-condition: dests is filled with bi_null().
*/
static void
bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
{
/* Setup the destinations */
for (unsigned i = 0; i < n; ++i) {
dests[i] = bi_temp(b->shader);
}
/* Emit the split */
if (n == 1) {
bi_mov_i32_to(b, dests[0], vec);
} else {
bi_instr *I = bi_split_i32_to(b, n, vec);
bi_foreach_dest(I, j)
I->dest[j] = dests[j];
}
}
static void
bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
{
bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
bi_emit_split_i32(b, dests, vec, n);
bi_cache_collect(b, vec, dests, n);
}
/*
* Emit and cache a split for a vector of a given bitsize. The vector may not be
* composed of 32-bit words, but it will be split at 32-bit word boundaries.
*/
static void
bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
{
bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
}
static void
bi_split_dest(bi_builder *b, nir_dest dest)
{
bi_emit_cached_split(b, bi_dest_index(&dest),
nir_dest_bit_size(dest) *
nir_dest_num_components(dest));
}
static bi_instr *
bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
{
/* Special case: COLLECT of a single value is a scalar move */
if (n == 1)
return bi_mov_i32_to(b, dst, chan[0]);
bi_instr *I = bi_collect_i32_to(b, dst, n);
bi_foreach_src(I, i)
I->src[i] = chan[i];
bi_cache_collect(b, dst, chan, n);
return I;
}
static bi_instr *
bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
{
return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2);
}
static bi_instr *
bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2)
{
return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3);
}
static bi_index
bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
{
bi_index dst = bi_temp(b->shader);
bi_collect_v2i32_to(b, dst, s0, s1);
return dst;
}
static bi_index
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
return bi_preload(b, 61);
/* Need to put the sample ID in the top 16 bits */
case nir_intrinsic_load_barycentric_at_sample:
return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
bi_half(bi_src_index(&intr->src[0]), false));
/* Interpret as 8:8 signed fixed point positions in pixels along X and
* Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
* is the center of the pixel so we first fixup and then convert. For
* fp16 input:
*
* f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
* f2i16((256 * (x, y)) + (128, 128)) =
* V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
*
* For fp32 input, that lacks enough precision for MSAA 16x, but the
* idea is the same. FIXME: still doesn't pass
*/
case nir_intrinsic_load_barycentric_at_offset: {
bi_index offset = bi_src_index(&intr->src[0]);
bi_index f16 = bi_null();
unsigned sz = nir_src_bit_size(intr->src[0]);
if (sz == 16) {
f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
bi_imm_f16(128.0));
} else {
assert(sz == 32);
bi_index f[2];
for (unsigned i = 0; i < 2; ++i) {
f[i] = bi_fadd_rscale_f32(b,
bi_extract(b, offset, i),
bi_imm_f32(0.5), bi_imm_u32(8),
BI_SPECIAL_NONE);
}
f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
}
return bi_v2f16_to_v2s16(b, f16);
}
case nir_intrinsic_load_barycentric_pixel:
default:
return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
}
}
static enum bi_sample
bi_interp_for_intrinsic(nir_intrinsic_op op)
{
switch (op) {
case nir_intrinsic_load_barycentric_centroid:
return BI_SAMPLE_CENTROID;
case nir_intrinsic_load_barycentric_sample:
case nir_intrinsic_load_barycentric_at_sample:
return BI_SAMPLE_SAMPLE;
case nir_intrinsic_load_barycentric_at_offset:
return BI_SAMPLE_EXPLICIT;
case nir_intrinsic_load_barycentric_pixel:
default:
return BI_SAMPLE_CENTER;
}
}
/* auto, 64-bit omitted */
static enum bi_register_format
bi_reg_fmt_for_nir(nir_alu_type T)
{
switch (T) {
case nir_type_float16: return BI_REGISTER_FORMAT_F16;
case nir_type_float32: return BI_REGISTER_FORMAT_F32;
case nir_type_int16: return BI_REGISTER_FORMAT_S16;
case nir_type_uint16: return BI_REGISTER_FORMAT_U16;
case nir_type_int32: return BI_REGISTER_FORMAT_S32;
case nir_type_uint32: return BI_REGISTER_FORMAT_U32;
default: unreachable("Invalid type for register format");
}
}
/* Checks if the _IMM variant of an intrinsic can be used, returning in
* *immediate the immediate to be used (still valid even when _IMM can't be
* used because the constant exceeds the maximum). */
static bool
bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
{
nir_src *offset = nir_get_io_offset_src(instr);
if (!nir_src_is_const(*offset))
return false;
*immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
return (*immediate) < max;
}
static void
bi_make_vec_to(bi_builder *b, bi_index final_dst,
bi_index *src,
unsigned *channel,
unsigned count,
unsigned bitsize);
/* Bifrost's load instructions lack a component offset despite operating in
* terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
* but they may be unavoidable with separate shaders in use. To solve this, we
* lower to a larger load and an explicit copy of the desired components. */
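/* For example, loading components 2-3 of a vec4 slot becomes a 4-component
* load into a temporary followed by a copy of channels 2 and 3 into the real
* destination. */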
static void
bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
{
unsigned component = nir_intrinsic_component(instr);
unsigned nr = instr->num_components;
unsigned total = nr + component;
unsigned bitsize = nir_dest_bit_size(instr->dest);
assert(total <= 4 && "should be vec4");
bi_emit_cached_split(b, tmp, total * bitsize);
if (component == 0)
return;
bi_index srcs[] = { tmp, tmp, tmp };
unsigned channels[] = { component, component + 1, component + 2 };
bi_make_vec_to(b, bi_dest_index(&instr->dest),
srcs, channels, nr, nir_dest_bit_size(instr->dest));
}
static void
bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
{
nir_alu_type T = nir_intrinsic_dest_type(instr);
enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
nir_src *offset = nir_get_io_offset_src(instr);
unsigned component = nir_intrinsic_component(instr);
enum bi_vecsize vecsize = (instr->num_components + component - 1);
unsigned imm_index = 0;
unsigned base = nir_intrinsic_base(instr);
bool constant = nir_src_is_const(*offset);
bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
bi_instr *I;
if (immediate) {
I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b),
bi_instance_id(b), regfmt, vecsize,
imm_index);
} else {
bi_index idx = bi_src_index(&instr->src[0]);
if (constant)
idx = bi_imm_u32(imm_index);
else if (base != 0)
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b),
idx, regfmt, vecsize);
}
if (b->shader->arch >= 9)
I->table = PAN_TABLE_ATTRIBUTE;
bi_copy_component(b, instr, dest);
}
/*
* ABI: Special (desktop GL) slots come first, tightly packed. General varyings
* come later, sparsely packed. This handles both linked and separable shaders
* with a common code path, with minimal keying only for desktop GL. Each slot
* consumes 16 bytes (TODO: fp16, partial vectors).
*/
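/* For example, with two special slots set in fixed_varying_mask, the second
* special slot lands at byte 16 * 1 = 16 and VAR3 lands at byte
* 16 * (2 + 3) = 80. */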
static unsigned
bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
uint32_t mask = ctx->inputs->fixed_varying_mask;
if (sem.location >= VARYING_SLOT_VAR0) {
unsigned nr_special = util_bitcount(mask);
unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
return 16 * (nr_special + general_index);
} else {
return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
}
}
/*
* Compute the offset in bytes of a varying with an immediate offset, adding the
* offset to the base computed above. Convenience method.
*/
static unsigned
bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
{
nir_src *src = nir_get_io_offset_src(intr);
assert(nir_src_is_const(*src) && "assumes immediate offset");
return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
}
static void
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
enum bi_sample sample = BI_SAMPLE_CENTER;
enum bi_update update = BI_UPDATE_STORE;
enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
bi_index src0 = bi_null();
unsigned component = nir_intrinsic_component(instr);
enum bi_vecsize vecsize = (instr->num_components + component - 1);
bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
unsigned sz = nir_dest_bit_size(instr->dest);
if (smooth) {
nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
assert(parent);
sample = bi_interp_for_intrinsic(parent->intrinsic);
src0 = bi_varying_src0_for_barycentric(b, parent);
assert(sz == 16 || sz == 32);
regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16
: BI_REGISTER_FORMAT_F32;
} else {
assert(sz == 32);
regfmt = BI_REGISTER_FORMAT_U32;
/* Valhall can't have bi_null() here, although the source is
* logically unused for flat varyings
*/
if (b->shader->arch >= 9)
src0 = bi_preload(b, 61);
/* Gather info as we go */
b->shader->info.bifrost->uses_flat_shading = true;
}
enum bi_source_format source_format =
smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
nir_src *offset = nir_get_io_offset_src(instr);
unsigned imm_index = 0;
bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);
bi_instr *I = NULL;
if (b->shader->malloc_idvs && immediate) {
/* Immediate index given in bytes. */
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt,
sample, source_format, update, vecsize,
bi_varying_offset(b->shader, instr));
} else if (immediate && smooth) {
I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
vecsize, imm_index);
} else if (immediate && !smooth) {
I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
vecsize, imm_index);
} else {
bi_index idx = bi_src_index(offset);
unsigned base = nir_intrinsic_base(instr);
if (b->shader->malloc_idvs) {
/* Index needs to be in bytes, but NIR gives the index
* in slots. For now assume 16 bytes per element.
*/
bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
unsigned vbase = bi_varying_base_bytes(b->shader, instr);
if (vbase != 0)
idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt,
sample, source_format, update,
vecsize);
} else if (smooth) {
if (base != 0)
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
update, vecsize);
} else {
if (base != 0)
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
I = bi_ld_var_flat_to(b, dest, idx,
BI_FUNCTION_NONE, regfmt,
vecsize);
}
}
/* Valhall usually uses machine-allocated IDVS. If this is disabled, use
* a simple Midgard-style ABI.
*/
if (b->shader->arch >= 9 && I != NULL)
I->table = PAN_TABLE_ATTRIBUTE;
bi_copy_component(b, instr, dest);
}
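/* Assemble up to four 8-bit channels from (source, channel) pairs into a
* single 32-bit word, zero-filling unused bytes: two MKVEC.v2i8s on Valhall,
* one MKVEC.v4i8 otherwise. */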
static bi_index
bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel, unsigned count)
{
assert(1 <= count && count <= 4);
bi_index bytes[4] = {
bi_imm_u8(0),
bi_imm_u8(0),
bi_imm_u8(0),
bi_imm_u8(0)
};
for (unsigned i = 0; i < count; ++i) {
unsigned chan = channel ? channel[i] : 0;
bytes[i] = bi_byte(bi_extract(b, src[i], chan >> 2), chan & 3);
}
if (b->shader->arch >= 9) {
bi_index vec = bi_zero();
if (count >= 3)
vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec);
return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec);
} else {
return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]);
}
}
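/* Assemble one or two 16-bit channels from (source, channel) pairs into a
* 32-bit word. A single channel is zero-extended; two channels taken from the
* same word become a move or a swizzle instead of an MKVEC.v2i16. */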
static bi_index
bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel, unsigned count)
{
unsigned chan0 = channel ? channel[0] : 0;
bi_index w0 = bi_extract(b, src[0], chan0 >> 1);
bi_index h0 = bi_half(w0, chan0 & 1);
/* Zero extend */
if (count == 1)
return bi_mkvec_v2i16(b, h0, bi_imm_u16(0));
/* Else, create a vector */
assert(count == 2);
unsigned chan1 = channel ? channel[1] : 0;
bi_index w1 = bi_extract(b, src[1], chan1 >> 1);
bi_index h1 = bi_half(w1, chan1 & 1);
if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1))
return bi_mov_i32(b, w0);
else if (bi_is_word_equiv(w0, w1))
return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1));
else
return bi_mkvec_v2i16(b, h0, h1);
}
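/* Assemble an arbitrary vector from (source, channel) pairs, one 32-bit word
* at a time, and COLLECT the resulting words into dst. */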
static void
bi_make_vec_to(bi_builder *b, bi_index dst,
bi_index *src,
unsigned *channel,
unsigned count,
unsigned bitsize)
{
assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
unsigned chan_per_word = 1 << shift;
assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS &&
"unnecessarily large vector should have been lowered");
bi_index srcs[BI_MAX_VEC];
for (unsigned i = 0; i < count; i += chan_per_word) {
unsigned rem = MIN2(count - i, chan_per_word);
unsigned *channel_offset = channel ? (channel + i) : NULL;
if (bitsize == 32)
srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0);
else if (bitsize == 16)
srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem);
else
srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
}
bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
}
static inline bi_instr *
bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
bi_index src1)
{
bi_instr *I;
if (b->shader->arch >= 9) {
I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
I->seg = BI_SEG_UBO;
} else {
I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
}
bi_emit_cached_split(b, dest0, bitsize);
return I;
}
static bi_instr *
bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
unsigned nr_components, unsigned offset)
{
unsigned sysval_ubo = b->shader->inputs->fixed_sysval_ubo >= 0 ?
b->shader->inputs->fixed_sysval_ubo :
b->shader->nir->info.num_ubos;
unsigned uniform =
pan_lookup_sysval(b->shader->sysval_to_id,
b->shader->info.sysvals,
sysval);
unsigned idx = (uniform * 16) + offset;
return bi_load_ubo_to(b, nr_components * 32, dest,
bi_imm_u32(idx), bi_imm_u32(sysval_ubo));
}
static void
bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,
unsigned nr_components, unsigned offset)
{
bi_load_sysval_to(b, bi_dest_index(&intr->dest),
panfrost_sysval_for_instr(&intr->instr, NULL),
nr_components, offset);
}
static bi_index
bi_load_sysval(bi_builder *b, int sysval,
unsigned nr_components, unsigned offset)
{
bi_index tmp = bi_temp(b->shader);
bi_load_sysval_to(b, tmp, sysval, nr_components, offset);
return tmp;
}
static void
bi_load_sample_id_to(bi_builder *b, bi_index dst)
{
/* r61[16:23] contains the sample ID; mask it off. The upper bits
* seem to read garbage (despite being architecturally defined
* as zero), so use a 5-bit mask instead of 8 bits */
bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
bi_imm_u8(16), false);
}
static bi_index
bi_load_sample_id(bi_builder *b)
{
bi_index sample_id = bi_temp(b->shader);
bi_load_sample_id_to(b, sample_id);
return sample_id;
}
static bi_index
bi_pixel_indices(bi_builder *b, unsigned rt)
{
/* We want to load the current pixel. */
struct bifrost_pixel_indices pix = {
.y = BIFROST_CURRENT_PIXEL,
.rt = rt
};
uint32_t indices_u32 = 0;
memcpy(&indices_u32, &pix, sizeof(indices_u32));
bi_index indices = bi_imm_u32(indices_u32);
/* Sample index above is left as zero. For multisampling, we need to
* fill in the actual sample ID in the lower byte */
if (b->shader->inputs->blend.nr_samples > 1)
indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
return indices;
}
/* Source color is passed through r0-r3, or r4-r7 for the second source when
* dual-source blending. Preload the corresponding vector.
*/
static void
bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
assert(size == 16 || size == 32);
bi_index srcs[] = {
bi_preload(b, base + 0), bi_preload(b, base + 1),
bi_preload(b, base + 2), bi_preload(b, base + 3)
};
bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2);
}
static void
bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T,
bi_index rgba2, nir_alu_type T2, unsigned rt)
{
/* Reads 2 or 4 staging registers to cover the input */
unsigned size = nir_alu_type_get_type_size(T);
unsigned size_2 = nir_alu_type_get_type_size(T2);
unsigned sr_count = (size <= 16) ? 2 : 4;
unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
const struct panfrost_compile_inputs *inputs = b->shader->inputs;
uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
/* Workaround for NIR-to-TGSI */
if (b->shader->nir->info.fs.untyped_color_outputs)
regfmt = BI_REGISTER_FORMAT_AUTO;
if (inputs->is_blend && inputs->blend.nr_samples > 1) {
/* Conversion descriptor comes from the compile inputs, pixel
* indices derived at run time based on sample ID */
bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
bi_imm_u32(blend_desc >> 32),
regfmt, BI_VECSIZE_V4);
} else if (b->shader->inputs->is_blend) {
uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
/* Blend descriptor comes from the compile inputs */
/* Put the result in r0 */
bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
bi_imm_u32(blend_desc),
bi_imm_u32(blend_desc >> 32),
bi_null(), regfmt, sr_count, 0);
} else {
/* Blend descriptor comes from the FAU RAM. By convention, the
* return address on Bifrost is stored in r48 and will be used
* by the blend shader to jump back to the fragment shader */
bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
bi_fau(BIR_FAU_BLEND_0 + rt, false),
bi_fau(BIR_FAU_BLEND_0 + rt, true),
rgba2, regfmt, sr_count, sr_count_2);
}
assert(rt < 8);
b->shader->info.bifrost->blend[rt].type = T;
if (T2)
b->shader->info.bifrost->blend_src1_type = T2;
}
/* Blend shaders do not need to run ATEST since they are dependent on a
* fragment shader that runs it. Blit shaders may not need to run ATEST, since
* ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
* there are no writes to the coverage mask. The latter two are satisfied for
* all blit shaders, so we just care about early-z, which blit shaders force
* iff they do not write depth or stencil */
static bool
bi_skip_atest(bi_context *ctx, bool emit_zs)
{
return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
}
static void
bi_emit_atest(bi_builder *b, bi_index alpha)
{
b->shader->coverage = bi_atest(b, bi_coverage(b), alpha,
bi_fau(BIR_FAU_ATEST_PARAM, false));
b->shader->emitted_atest = true;
}
static void
bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
{
bool combined = instr->intrinsic ==
nir_intrinsic_store_combined_output_pan;
unsigned writeout = combined ? nir_intrinsic_component(instr) :
PAN_WRITEOUT_C;
bool emit_blend = writeout & (PAN_WRITEOUT_C);
bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
unsigned loc = nir_intrinsic_io_semantics(instr).location;
bi_index src0 = bi_src_index(&instr->src[0]);
/* By ISA convention, the coverage mask is stored in R60. The store
* itself will be handled by a subsequent ATEST instruction */
if (loc == FRAG_RESULT_SAMPLE_MASK) {
bi_index orig = bi_coverage(b);
bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0));
b->shader->coverage =
bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO);
return;
}
/* Emit ATEST if we have to, note ATEST requires a floating-point alpha
* value, but render target #0 might not be floating point. However the
* alpha value is only used for alpha-to-coverage, a stage which is
* skipped for pure integer framebuffers, so the issue is moot. */
if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
nir_alu_type T = nir_intrinsic_src_type(instr);
bi_index rgba = bi_src_index(&instr->src[0]);
bi_index alpha =
(T == nir_type_float16) ? bi_half(bi_extract(b, rgba, 1), true) :
(T == nir_type_float32) ? bi_extract(b, rgba, 3) :
bi_dontcare(b);
/* Don't read out-of-bounds */
if (nir_src_num_components(instr->src[0]) < 4)
alpha = bi_imm_f32(1.0);
bi_emit_atest(b, alpha);
}
if (emit_zs) {
bi_index z = bi_dontcare(b), s = bi_dontcare(b);
if (writeout & PAN_WRITEOUT_Z)
z = bi_src_index(&instr->src[2]);
if (writeout & PAN_WRITEOUT_S)
s = bi_src_index(&instr->src[3]);
b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b),
writeout & PAN_WRITEOUT_S,
writeout & PAN_WRITEOUT_Z);
}
if (emit_blend) {
unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
bool dual = (writeout & PAN_WRITEOUT_2);
bi_index color = bi_src_index(&instr->src[0]);
bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null();
nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
/* Explicit copy since BLEND inputs are precoloured to R0-R3,
* TODO: maybe schedule around this or implement in RA as a
* spill */
bool has_mrt = (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1);
if (has_mrt) {
bi_index srcs[4] = { color, color, color, color };
unsigned channels[4] = { 0, 1, 2, 3 };
color = bi_temp(b->shader);
bi_make_vec_to(b, color, srcs, channels,
nir_src_num_components(instr->src[0]),
nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
}
bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr),
color2, T2, rt);
}
if (b->shader->inputs->is_blend) {
/* Jump back to the fragment shader, return address is stored
* in r48 (see above). On Valhall, only jump if the address is
* nonzero. The check is free there and it implements the "jump
* to 0 terminates the blend shader" that's automatic on
* Bifrost.
*/
if (b->shader->arch >= 8)
bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
else
bi_jump(b, bi_preload(b, 48));
}
}
/**
* In a vertex shader, is the specified variable a position output? These kinds
* of outputs are written from position shaders when IDVS is enabled. All other
* outputs are written from the varying shader.
*/
static bool
bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
switch (sem.location) {
case VARYING_SLOT_POS:
case VARYING_SLOT_PSIZ:
return idvs == BI_IDVS_VARYING;
default:
return idvs == BI_IDVS_POSITION;
}
}
static bool
bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
{
enum bi_idvs_mode *idvs = data;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_store_output)
return false;
if (bi_should_remove_store(intr, *idvs)) {
nir_instr_remove(instr);
return true;
}
return false;
}
static void
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
/* In principle we can do better for 16-bit. At the moment we require
* 32-bit to permit the use of .auto, in order to force .u32 for flat
* varyings, to handle internal TGSI shaders that set flat in the VS
* but smooth in the FS */
ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
unsigned imm_index = 0;
bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
/* Only look at the total components needed. In effect, we fill in all
* the intermediate "holes" in the write mask, since we can't mask off
* stores. Since nir_lower_io_to_temporaries ensures each varying is
* written at most once, anything that's masked out is undefined, so it
* doesn't matter what we write there. So we may as well do the
* simplest thing possible. */
unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
bi_index data = bi_src_index(&instr->src[0]);
/* To keep the vector dimensions consistent, we need to drop some
* components. This should be coalesced.
*
* TODO: This is ugly and maybe inefficient. Would we rather
* introduce a TRIM.i32 pseudoinstruction?
*/
if (nr < nir_intrinsic_src_components(instr, 0)) {
assert(T_size == 32 && "todo: 16-bit trim");
bi_index chans[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
unsigned src_comps = nir_intrinsic_src_components(instr, 0);
bi_emit_split_i32(b, chans, data, src_comps);
bi_index tmp = bi_temp(b->shader);
bi_instr *collect = bi_collect_i32_to(b, tmp, nr);
bi_foreach_src(collect, w)
collect->src[w] = chans[w];
data = tmp;
}
bool psiz = (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
bi_index a[4] = { bi_null() };
if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
/* Bifrost position shaders have a fast path */
assert(T == nir_type_float16 || T == nir_type_float32);
unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
unsigned snap4 = 0x5E;
uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
bi_imm_u32(format), regfmt, nr - 1);
} else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
bi_index index = bi_preload(b, 59);
if (psiz) {
assert(T_size == 16 && "should've been lowered");
index = bi_iadd_imm_i32(b, index, 4);
}
bi_index address = bi_lea_buf_imm(b, index);
bi_emit_split_i32(b, a, address, 2);
bool varying = (b->shader->idvs == BI_IDVS_VARYING);
bi_store(b, nr * nir_src_bit_size(instr->src[0]),
data, a[0], a[1],
varying ? BI_SEG_VARY : BI_SEG_POS,
varying ? bi_varying_offset(b->shader, instr) : 0);
} else if (immediate) {
bi_index address = bi_lea_attr_imm(b,
bi_vertex_id(b), bi_instance_id(b),
regfmt, imm_index);
bi_emit_split_i32(b, a, address, 3);
bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
} else {
bi_index idx =
bi_iadd_u32(b,
bi_src_index(nir_get_io_offset_src(instr)),
bi_imm_u32(nir_intrinsic_base(instr)),
false);
bi_index address = bi_lea_attr(b,
bi_vertex_id(b), bi_instance_id(b),
idx, regfmt);
bi_emit_split_i32(b, a, address, 3);
bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
}
}
static void
bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
{
nir_src *offset = nir_get_io_offset_src(instr);
bool offset_is_const = nir_src_is_const(*offset);
bi_index dyn_offset = bi_src_index(offset);
uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
bi_load_ubo_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
bi_dest_index(&instr->dest), offset_is_const ?
bi_imm_u32(const_offset) : dyn_offset,
bi_src_index(&instr->src[0]));
}
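/* Push constants are read from FAU RAM: 32-bit word w of the push range lives
* in uniform slot (w >> 1), with (w & 1) selecting the low or high half. */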
static void
bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
{
assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
nir_src *offset = &instr->src[0];
assert(nir_src_is_const(*offset) && "no indirect push constants");
uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
assert((base & 3) == 0 && "unaligned push constants");
unsigned bits = nir_dest_bit_size(instr->dest) *
nir_dest_num_components(instr->dest);
unsigned n = DIV_ROUND_UP(bits, 32);
assert(n <= 4);
bi_index channels[4] = { bi_null() };
for (unsigned i = 0; i < n; ++i) {
unsigned word = (base >> 2) + i;
channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
}
bi_emit_collect_to(b, bi_dest_index(&instr->dest), channels, n);
}
static bi_index
bi_addr_high(bi_builder *b, nir_src *src)
{
return (nir_src_bit_size(*src) == 64) ?
bi_extract(b, bi_src_index(src), 1) : bi_zero();
}
static void
bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, enum bi_seg seg, int16_t *offset)
{
/* Not needed on Bifrost or for global accesses */
if (b->shader->arch < 9 || seg == BI_SEG_NONE)
return;
/* There is no segment modifier on Valhall. Instead, we need to
* emit the arithmetic ourselves. We do have an offset
* available, which saves an instruction for constant offsets.
*/
bool wls = (seg == BI_SEG_WLS);
assert(wls || (seg == BI_SEG_TL));
enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
bi_index base_lo = bi_fau(fau, false);
if (offset && addr_lo->type == BI_INDEX_CONSTANT && addr_lo->value == (int16_t) addr_lo->value) {
*offset = addr_lo->value;
*addr_lo = base_lo;
} else {
*addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
}
/* Do not allow overflow for WLS or TLS */
*addr_hi = bi_fau(fau, true);
}
static void
bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
int16_t offset = 0;
unsigned bits = instr->num_components * nir_dest_bit_size(instr->dest);
bi_index dest = bi_dest_index(&instr->dest);
bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
bi_emit_cached_split(b, dest, bits);
}
static void
bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
/* Require contiguous masks, guaranteed by nir_lower_wrmasks */
assert(nir_intrinsic_write_mask(instr) ==
BITFIELD_MASK(instr->num_components));
int16_t offset = 0;
bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
bi_src_index(&instr->src[0]),
addr_lo, addr_hi, seg, offset);
}
/* Exchanges the staging register with memory */
static void
bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
{
assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
unsigned sz = nir_src_bit_size(*arg);
assert(sz == 32 || sz == 64);
bi_index data = bi_src_index(arg);
bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
if (b->shader->arch >= 9)
bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
else if (seg == BI_SEG_WLS)
addr_hi = bi_zero();
bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
}
/* Exchanges the second staging register with memory if comparison with first
* staging register passes */
static void
bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
{
assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
/* The hardware expects the operands in the opposite order from NIR */
bi_index src0 = bi_src_index(arg_2);
bi_index src1 = bi_src_index(arg_1);
unsigned sz = nir_src_bit_size(*arg_1);
assert(sz == 32 || sz == 64);
bi_index data_words[] = {
bi_extract(b, src0, 0),
sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
/* 64-bit */
bi_extract(b, src1, 0),
sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
};
bi_index in = bi_temp(b->shader);
bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
if (b->shader->arch >= 9)
bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
else if (seg == BI_SEG_WLS)
addr_hi = bi_zero();
bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
bi_emit_cached_split(b, out, sz);
bi_index inout_words[] = {
bi_extract(b, out, 0),
sz == 64 ? bi_extract(b, out, 1) : bi_null()
};
bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
}
/* Extracts an atomic opcode */
static enum bi_atom_opc
bi_atom_opc_for_nir(nir_intrinsic_op op)
{
switch (op) {
case nir_intrinsic_global_atomic_add:
case nir_intrinsic_shared_atomic_add:
case nir_intrinsic_image_atomic_add:
return BI_ATOM_OPC_AADD;
case nir_intrinsic_global_atomic_imin:
case nir_intrinsic_shared_atomic_imin:
case nir_intrinsic_image_atomic_imin:
return BI_ATOM_OPC_ASMIN;
case nir_intrinsic_global_atomic_umin:
case nir_intrinsic_shared_atomic_umin:
case nir_intrinsic_image_atomic_umin:
return BI_ATOM_OPC_AUMIN;
case nir_intrinsic_global_atomic_imax:
case nir_intrinsic_shared_atomic_imax:
case nir_intrinsic_image_atomic_imax:
return BI_ATOM_OPC_ASMAX;
case nir_intrinsic_global_atomic_umax:
case nir_intrinsic_shared_atomic_umax:
case nir_intrinsic_image_atomic_umax:
return BI_ATOM_OPC_AUMAX;
case nir_intrinsic_global_atomic_and:
case nir_intrinsic_shared_atomic_and:
case nir_intrinsic_image_atomic_and:
return BI_ATOM_OPC_AAND;
case nir_intrinsic_global_atomic_or:
case nir_intrinsic_shared_atomic_or:
case nir_intrinsic_image_atomic_or:
return BI_ATOM_OPC_AOR;
case nir_intrinsic_global_atomic_xor:
case nir_intrinsic_shared_atomic_xor:
case nir_intrinsic_image_atomic_xor:
return BI_ATOM_OPC_AXOR;
default:
unreachable("Unexpected computational atomic");
}
}
/* Optimized unary atomics are available with an implied #1 argument */
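/* For example, an atomic add of constant +1 or -1 maps to AINC or ADEC, and an
* atomic or with constant 1 maps to AOR1. */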
static bool
bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
{
/* Check we have a compatible constant */
if (arg.type != BI_INDEX_CONSTANT)
return false;
if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
return false;
/* Check for a compatible operation */
switch (op) {
case BI_ATOM_OPC_AADD:
*out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
return true;
case BI_ATOM_OPC_ASMAX:
*out = BI_ATOM_OPC_ASMAX1;
return true;
case BI_ATOM_OPC_AUMAX:
*out = BI_ATOM_OPC_AUMAX1;
return true;
case BI_ATOM_OPC_AOR:
*out = BI_ATOM_OPC_AOR1;
return true;
default:
return false;
}
}
/*
* Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
* translate between these forms (with MKVEC.v2i16).
*
* Additionally, on Valhall, cube maps in the attribute pipe are treated as 2D
* arrays. For uniform handling, we also treat 3D textures like 2D arrays.
*
* Our indexing needs to reflect this.
*/
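/* For example, a 3D image splits its coordinate into (x, y) packed as v2i16
* for the first source, with z in the second source: packed into a half-word
* on Valhall, left as a full 32-bit word on Bifrost. */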
static bi_index
bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
unsigned coord_comps, bool is_array)
{
assert(coord_comps > 0 && coord_comps <= 3);
if (src_idx == 0) {
if (coord_comps == 1 || (coord_comps == 2 && is_array))
return bi_extract(b, coord, 0);
else
return bi_mkvec_v2i16(b,
bi_half(bi_extract(b, coord, 0), false),
bi_half(bi_extract(b, coord, 1), false));
} else {
if (coord_comps == 3 && b->shader->arch >= 9)
return bi_mkvec_v2i16(b, bi_imm_u16(0),
bi_half(bi_extract(b, coord, 2), false));
else if (coord_comps == 2 && is_array && b->shader->arch >= 9)
return bi_mkvec_v2i16(b, bi_imm_u16(0),
bi_half(bi_extract(b, coord, 1), false));
else if (coord_comps == 3)
return bi_extract(b, coord, 2);
else if (coord_comps == 2 && is_array)
return bi_extract(b, coord, 1);
else
return bi_zero();
}
}
static bi_index
bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
{
nir_src src = instr->src[0];
bi_index index = bi_src_index(&src);
bi_context *ctx = b->shader;
/* Images come after vertex attributes, so handle an explicit offset */
unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?
util_bitcount64(ctx->nir->info.inputs_read) : 0;
if (offset == 0)
return index;
else if (nir_src_is_const(src))
return bi_imm_u32(nir_src_as_uint(src) + offset);
else
return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
}
static void
bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
{
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
bool array = nir_intrinsic_image_array(instr);
ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
bi_index coords = bi_src_index(&instr->src[1]);
bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
bi_index dest = bi_dest_index(&instr->dest);
enum bi_register_format regfmt = bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
enum bi_vecsize vecsize = instr->num_components - 1;
/* TODO: MSAA */
assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize,
nir_src_as_uint(instr->src[0]));
I->table = PAN_TABLE_IMAGE;
} else if (b->shader->arch >= 9) {
unreachable("Indirect images on Valhall not yet supported");
} else {
bi_ld_attr_tex_to(b, dest, xy, zw,
bi_emit_image_index(b, instr), regfmt,
vecsize);
}
bi_split_dest(b, instr->dest);
}
static bi_index
bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
{
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
bool array = nir_intrinsic_image_array(instr);
ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
/* TODO: MSAA */
assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?
bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :
BI_REGISTER_FORMAT_AUTO;
bi_index coords = bi_src_index(&instr->src[1]);
bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
bi_index dest = bi_temp(b->shader);
if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false,
nir_src_as_uint(instr->src[0]));
I->table = PAN_TABLE_IMAGE;
} else if (b->shader->arch >= 9) {
unreachable("Indirect images on Valhall not yet supported");
} else {
bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
bi_emit_image_index(b, instr), type);
/* LEA_ATTR_TEX defaults to the secondary attribute table, but
* our ABI has all images in the primary attribute table
*/
I->table = BI_TABLE_ATTRIBUTE_1;
}
bi_emit_cached_split(b, dest, 3 * 32);
return dest;
}
static void
bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index a[4] = { bi_null() };
bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
/* Due to SPIR-V limitations, the source type is not fully reliable: it
* reports uint32 even for write_imagei. This causes an incorrect
* u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32
* instead, which will match per the OpenCL spec. Of course this does
* not work for 16-bit stores, but those are not available in OpenCL.
*/
nir_alu_type T = nir_intrinsic_src_type(instr);
assert(nir_alu_type_get_type_size(T) == 32);
bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
BI_REGISTER_FORMAT_AUTO,
instr->num_components - 1);
}
static void
bi_emit_atomic_i32_to(bi_builder *b, bi_index dst,
bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)
{
enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);
enum bi_atom_opc post_opc = opc;
bool bifrost = b->shader->arch <= 8;
/* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
* take any vector but can still output in RETURN mode */
bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
unsigned sr_count = bifrost ? 2 : 1;
/* Generate either ATOM or ATOM1 as required */
if (bi_promote_atom_c1(opc, arg, &opc)) {
bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
bi_extract(b, addr, 1), opc, sr_count);
} else {
bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
bi_extract(b, addr, 1), opc, sr_count);
}
if (bifrost) {
/* Post-process it */
bi_emit_cached_split_i32(b, tmp_dest, 2);
bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0), bi_extract(b, tmp_dest, 1), post_opc);
}
}
/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
* gl_FragCoord.z = ld_vary(fragz)
* gl_FragCoord.w = ld_vary(fragw)
*/
static void
bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index src[4] = {};
for (unsigned i = 0; i < 2; ++i) {
src[i] = bi_fadd_f32(b,
bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)),
bi_imm_f32(0.5f));
}
for (unsigned i = 0; i < 2; ++i) {
src[2 + i] = bi_ld_var_special(b, bi_zero(),
BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
BI_UPDATE_CLOBBER,
(i == 0) ? BI_VARYING_NAME_FRAG_Z :
BI_VARYING_NAME_FRAG_W,
BI_VECSIZE_NONE);
}
bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);
}
static void
bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index dest = bi_dest_index(&instr->dest);
nir_alu_type T = nir_intrinsic_dest_type(instr);
enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
unsigned rt = b->shader->inputs->blend.rt;
unsigned size = nir_dest_bit_size(instr->dest);
unsigned nr = instr->num_components;
/* Get the render target */
if (!b->shader->inputs->is_blend) {
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
unsigned loc = sem.location;
assert(loc >= FRAG_RESULT_DATA0);
rt = (loc - FRAG_RESULT_DATA0);
}
bi_index desc = b->shader->inputs->is_blend ?
bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :
b->shader->inputs->bifrost.static_rt_conv ?
bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) :
bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);
bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc,
regfmt, nr - 1);
bi_emit_cached_split(b, dest, size * nr);
}
static void
bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
{
bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
bi_dest_index(&instr->dest) : bi_null();
gl_shader_stage stage = b->shader->stage;
switch (instr->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
case nir_intrinsic_load_barycentric_at_sample:
case nir_intrinsic_load_barycentric_at_offset:
/* handled later via load_vary */
break;
case nir_intrinsic_load_interpolated_input:
case nir_intrinsic_load_input:
if (b->shader->inputs->is_blend)
bi_emit_load_blend_input(b, instr);
else if (stage == MESA_SHADER_FRAGMENT)
bi_emit_load_vary(b, instr);
else if (stage == MESA_SHADER_VERTEX)
bi_emit_load_attr(b, instr);
else
unreachable("Unsupported shader stage");
break;
case nir_intrinsic_store_output:
if (stage == MESA_SHADER_FRAGMENT)
bi_emit_fragment_out(b, instr);
else if (stage == MESA_SHADER_VERTEX)
bi_emit_store_vary(b, instr);
else
unreachable("Unsupported shader stage");
break;
case nir_intrinsic_store_combined_output_pan:
assert(stage == MESA_SHADER_FRAGMENT);
bi_emit_fragment_out(b, instr);
break;
case nir_intrinsic_load_ubo:
bi_emit_load_ubo(b, instr);
break;
case nir_intrinsic_load_push_constant:
bi_emit_load_push_constant(b, instr);
break;
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
bi_emit_load(b, instr, BI_SEG_NONE);
break;
case nir_intrinsic_store_global:
bi_emit_store(b, instr, BI_SEG_NONE);
break;
case nir_intrinsic_load_scratch:
bi_emit_load(b, instr, BI_SEG_TL);
break;
case nir_intrinsic_store_scratch:
bi_emit_store(b, instr, BI_SEG_TL);
break;
case nir_intrinsic_load_shared:
bi_emit_load(b, instr, BI_SEG_WLS);
break;
case nir_intrinsic_store_shared:
bi_emit_store(b, instr, BI_SEG_WLS);
break;
/* Blob doesn't seem to do anything for memory barriers, note +BARRIER
* is illegal in fragment shaders */
case nir_intrinsic_memory_barrier:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared:
case nir_intrinsic_group_memory_barrier:
break;
case nir_intrinsic_control_barrier:
assert(b->shader->stage != MESA_SHADER_FRAGMENT);
bi_barrier(b);
break;
case nir_intrinsic_scoped_barrier:
assert(b->shader->stage != MESA_SHADER_FRAGMENT);
assert(nir_intrinsic_memory_scope(instr) > NIR_SCOPE_SUBGROUP &&
"todo: subgroup barriers (different divergence rules)");
bi_barrier(b);
break;
case nir_intrinsic_shared_atomic_add:
case nir_intrinsic_shared_atomic_imin:
case nir_intrinsic_shared_atomic_umin:
case nir_intrinsic_shared_atomic_imax:
case nir_intrinsic_shared_atomic_umax:
case nir_intrinsic_shared_atomic_and:
case nir_intrinsic_shared_atomic_or:
case nir_intrinsic_shared_atomic_xor: {
assert(nir_src_bit_size(instr->src[1]) == 32);
bi_index addr = bi_src_index(&instr->src[0]);
bi_index addr_hi;
if (b->shader->arch >= 9) {
bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
addr = bi_collect_v2i32(b, addr, addr_hi);
} else {
addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
bi_emit_cached_split(b, addr, 64);
}
bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),
instr->intrinsic);
bi_split_dest(b, instr->dest);
break;
}
case nir_intrinsic_image_atomic_add:
case nir_intrinsic_image_atomic_imin:
case nir_intrinsic_image_atomic_umin:
case nir_intrinsic_image_atomic_imax:
case nir_intrinsic_image_atomic_umax:
case nir_intrinsic_image_atomic_and:
case nir_intrinsic_image_atomic_or:
case nir_intrinsic_image_atomic_xor:
assert(nir_src_bit_size(instr->src[3]) == 32);
bi_emit_atomic_i32_to(b, dst,
bi_emit_lea_image(b, instr),
bi_src_index(&instr->src[3]),
instr->intrinsic);
bi_split_dest(b, instr->dest);
break;
case nir_intrinsic_global_atomic_add:
case nir_intrinsic_global_atomic_imin:
case nir_intrinsic_global_atomic_umin:
case nir_intrinsic_global_atomic_imax:
case nir_intrinsic_global_atomic_umax:
case nir_intrinsic_global_atomic_and:
case nir_intrinsic_global_atomic_or:
case nir_intrinsic_global_atomic_xor:
assert(nir_src_bit_size(instr->src[1]) == 32);
bi_emit_atomic_i32_to(b, dst,
bi_src_index(&instr->src[0]),
bi_src_index(&instr->src[1]),
instr->intrinsic);
bi_split_dest(b, instr->dest);
break;
case nir_intrinsic_image_load:
bi_emit_image_load(b, instr);
break;
case nir_intrinsic_image_store:
bi_emit_image_store(b, instr);
break;
case nir_intrinsic_global_atomic_exchange:
bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
&instr->src[1], BI_SEG_NONE);
bi_split_dest(b, instr->dest);
break;
case nir_intrinsic_image_atomic_exchange:
bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),
&instr->src[3], BI_SEG_NONE);
bi_split_dest(b, instr->dest);
break;
case nir_intrinsic_shared_atomic_exchange:
bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
&instr->src[1], BI_SEG_WLS);
bi_split_dest(b, instr->dest);
break;
case nir_intrinsic_global_atomic_comp_swap:
bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
&instr->src[1], &instr->src[2], BI_SEG_NONE);
bi_split_dest(b, instr->dest);
break;
case nir_intrinsic_image_atomic_comp_swap:
bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),
&instr->src[3], &instr->src[4], BI_SEG_NONE);
bi_split_dest(b, instr->dest);
break;
case nir_intrinsic_shared_atomic_comp_swap:
bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
&instr->src[1], &instr->src[2], BI_SEG_WLS);
bi_split_dest(b, instr->dest);
break;
case nir_intrinsic_load_frag_coord:
bi_emit_load_frag_coord(b, instr);
break;
case nir_intrinsic_load_output:
bi_emit_ld_tile(b, instr);
break;
case nir_intrinsic_discard_if:
bi_discard_b32(b, bi_src_index(&instr->src[0]));
break;
case nir_intrinsic_discard:
bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
break;
case nir_intrinsic_load_ssbo_address:
case nir_intrinsic_load_xfb_address:
bi_load_sysval_nir(b, instr, 2, 0);
break;
case nir_intrinsic_load_work_dim:
case nir_intrinsic_load_num_vertices:
case nir_intrinsic_load_first_vertex:
case nir_intrinsic_load_draw_id:
bi_load_sysval_nir(b, instr, 1, 0);
break;
case nir_intrinsic_load_base_vertex:
bi_load_sysval_nir(b, instr, 1, 4);
break;
case nir_intrinsic_load_base_instance:
case nir_intrinsic_get_ssbo_size:
bi_load_sysval_nir(b, instr, 1, 8);
break;
case nir_intrinsic_load_viewport_scale:
case nir_intrinsic_load_viewport_offset:
case nir_intrinsic_load_num_workgroups:
case nir_intrinsic_load_workgroup_size:
bi_load_sysval_nir(b, instr, 3, 0);
break;
case nir_intrinsic_image_size:
bi_load_sysval_nir(b, instr,
nir_dest_num_components(instr->dest), 0);
break;
case nir_intrinsic_load_blend_const_color_rgba:
bi_load_sysval_nir(b, instr,
nir_dest_num_components(instr->dest), 0);
break;
case nir_intrinsic_load_sample_positions_pan:
bi_collect_v2i32_to(b, dst,
bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
break;
case nir_intrinsic_load_sample_mask_in:
/* r61[0:15] contains the coverage bitmap */
bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
break;
case nir_intrinsic_load_sample_id:
bi_load_sample_id_to(b, dst);
break;
case nir_intrinsic_load_front_face:
/* r58 == 0 means primitive is front facing */
bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
BI_RESULT_TYPE_M1);
break;
case nir_intrinsic_load_point_coord:
bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
bi_emit_cached_split_i32(b, dst, 2);
break;
/* It appears vertex_id is zero-based with Bifrost geometry flows, but
* not with Valhall's memory-allocated IDVS geometry flow. Ostensibly
* we support the legacy geometry flow even on Valhall, so
* vertex_id_zero_based isn't a machine property for us. Don't set it,
* and lower here if needed.
*/
case nir_intrinsic_load_vertex_id:
if (b->shader->malloc_idvs) {
bi_mov_i32_to(b, dst, bi_vertex_id(b));
} else {
bi_index first = bi_load_sysval(b,
PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS,
1, 0);
bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false);
}
break;
/* We only use this in our transform feedback lowering */
case nir_intrinsic_load_vertex_id_zero_base:
assert(b->shader->nir->info.has_transform_feedback_varyings);
bi_mov_i32_to(b, dst, bi_vertex_id(b));
break;
case nir_intrinsic_load_instance_id:
bi_mov_i32_to(b, dst, bi_instance_id(b));
break;
case nir_intrinsic_load_subgroup_invocation:
bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
break;
case nir_intrinsic_load_local_invocation_id:
bi_collect_v3i32_to(b, dst,
bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
break;
case nir_intrinsic_load_workgroup_id:
bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
bi_preload(b, 59));
break;
case nir_intrinsic_load_global_invocation_id:
case nir_intrinsic_load_global_invocation_id_zero_base:
bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
bi_preload(b, 62));
break;
case nir_intrinsic_shader_clock:
bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
bi_split_dest(b, instr->dest);
break;
default:
fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
assert(0);
}
}
static void
bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
{
/* Make sure we've been lowered */
assert(instr->def.num_components <= (32 / instr->def.bit_size));
/* Accumulate all the channels of the constant, as if we did an
* implicit SEL over them */
uint32_t acc = 0;
for (unsigned i = 0; i < instr->def.num_components; ++i) {
unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
acc |= (v << (i * instr->def.bit_size));
}
bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc));
}
static bi_index
bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
{
/* we don't lower modifiers until the backend */
assert(!(src.negate || src.abs));
unsigned bitsize = nir_src_bit_size(src.src);
/* the bi_index carries the 32-bit (word) offset separate from the
* subword swizzle, first handle the offset */
unsigned offset = 0;
assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
for (unsigned i = 0; i < comps; ++i) {
unsigned new_offset = (src.swizzle[i] >> subword_shift);
if (i > 0)
assert(offset == new_offset && "wrong vectorization");
offset = new_offset;
}
bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
/* Compose the subword swizzle with existing (identity) swizzle */
assert(idx.swizzle == BI_SWIZZLE_H01);
/* Bigger vectors should have been lowered */
assert(comps <= (1 << subword_shift));
if (bitsize == 16) {
unsigned c0 = src.swizzle[0] & 1;
unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
} else if (bitsize == 8) {
/* 8-bit vectors not yet supported */
assert(comps == 1 && "8-bit vectors not supported");
idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
}
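/* e.g. for a 16-bit source, a .zw swizzle selects word 1 with the identity
* H01 subword swizzle, while .yx within word 0 composes to H10 */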
return idx;
}
static enum bi_round
bi_nir_round(nir_op op)
{
switch (op) {
case nir_op_fround_even: return BI_ROUND_NONE;
case nir_op_ftrunc: return BI_ROUND_RTZ;
case nir_op_fceil: return BI_ROUND_RTP;
case nir_op_ffloor: return BI_ROUND_RTN;
default: unreachable("invalid nir round op");
}
}
/* Convenience for lowered transcendentals */
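/* The -0.0 addend is the identity for the accumulator: a*b + -0.0 == a*b
* even for signed zeroes, whereas +0.0 would flip a -0.0 product to +0.0 */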
static bi_index
bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
{
return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
}
/* Approximate with FRCP_APPROX.f32 and apply a single iteration of
* Newton-Raphson to improve precision */
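/* The refinement is the standard reciprocal Newton-Raphson step
* x' = x + x*(1 - a*x), evaluated on the frexp'd mantissa and rescaled by
* the (negated) exponent, presumably to keep the intermediates in range */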
static void
bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index x1 = bi_frcp_approx_f32(b, s0);
bi_index m = bi_frexpm_f32(b, s0, false, false);
bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
bi_zero(), BI_SPECIAL_N);
bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
}
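/* Likewise for reciprocal square root: FRSQ_APPROX.f32 refined by one
* Newton-Raphson step x' = x + x*(1 - a*x^2)/2, again split through
* frexp/rscale */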
static void
bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index x1 = bi_frsq_approx_f32(b, s0);
bi_index m = bi_frexpm_f32(b, s0, false, true);
bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
bi_index t1 = bi_fmul_f32(b, x1, x1);
bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
bi_imm_u32(-1), BI_SPECIAL_N);
bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
}
/* More complex transcendentals, see
* https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
* for documentation */
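/* FEXP2 lowering for cores lacking fast fp32 transcendentals. The scheme
* appears to be: 0x49400000 is 1.5 * 2^19, so adding it with
* round-to-nearest leaves round(s0 * 16) in the low mantissa bits, splitting
* s0 into an integer part a1i and a residual a2 within about +/- 1/32.
* 2^a2 is approximated by the polynomial
* 1 + a2*(ln2 + a2*(ln2^2/2 + a2*ln2^3/6)) -- the hex constants below are
* those coefficients -- then multiplied by the FEXP_TABLE.u4 value for the
* fractional bits and rescaled by a1i. */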
static void
bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index t1 = bi_temp(b->shader);
bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
a2->clamp = BI_CLAMP_CLAMP_M1_1;
bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
bi_imm_u32(0x3e75fffa));
bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
p3, a1t, a1t, a1i, BI_SPECIAL_NONE);
x->clamp = BI_CLAMP_CLAMP_0_INF;
bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
max->sem = BI_SEM_NAN_PROPAGATE;
}
static void
bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
{
/* Scale by the base (passed as its log2), multiply by 2^24, and convert to
* integer to get an 8:24 fixed-point input */
bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
bi_imm_u32(24), BI_SPECIAL_NONE);
bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
fixed_pt->round = BI_ROUND_NONE; // XXX
/* Compute the result for the fixed-point input, but pass along
* the floating-point scale for correct NaN propagation */
bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
}
static void
bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
{
/* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
bi_index a1 = bi_frexpm_f32(b, s0, true, false);
bi_index ei = bi_frexpe_f32(b, s0, true, false);
bi_index ef = bi_s32_to_f32(b, ei);
/* xt estimates -log(r1), a coarse approximation of log(a1) */
bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
/* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
* log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
* and then log(s0) = x1 + x2 */
bi_index x1 = bi_fadd_f32(b, ef, xt);
/* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
* polynomial approximation around 1. The series is expressed around
* 1, so set y = (a1 * r1) - 1.0 */
bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
/* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
* log_e(1 + y) by the Taylor series (lower precision than the blob):
* y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
bi_index loge = bi_fmul_f32(b, y,
bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
/* log(s0) = x1 + x2 */
bi_fadd_f32_to(b, dst, x1, x2);
}
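/* Fast path when the hardware has fast fp32 transcendentals: with
* s0 = m * 2^e, log2(s0) = e + log2(m). FLOGD.f32 appears to supply a
* log2(m) / (m - 1) factor that is applied to the (m - 1) term from
* FADD_LSCALE; see the ISA docs above for the exact unit semantics. */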
static void
bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index frexp = bi_frexpe_f32(b, s0, true, false);
bi_index frexpi = bi_s32_to_f32(b, frexp);
bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
}
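/* pow(base, exp) = 2^(exp * log2(base)). When the base is a compile-time
* constant, log2(base) folds to an immediate; otherwise it is computed with
* the matching flog2 path above. */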
static void
bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
{
bi_index log2_base = bi_null();
if (base.type == BI_INDEX_CONSTANT) {
log2_base = bi_imm_f32(log2f(uif(base.value)));
} else {
log2_base = bi_temp(b->shader);
bi_lower_flog2_32(b, log2_base, base);
}
return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
}
static void
bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
{
bi_index log2_base = bi_null();
if (base.type == BI_INDEX_CONSTANT) {
log2_base = bi_imm_f32(log2f(uif(base.value)));
} else {
log2_base = bi_temp(b->shader);
bi_flog2_32(b, log2_base, base);
}
return bi_fexp_32(b, dst, exp, log2_base);
}
/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
* FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
* calculates the results. We use them to calculate sin/cos via a Taylor
* approximation:
*
* f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
* sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
* cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
*/
#define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f)
#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
#define SINCOS_BIAS bi_imm_u32(0x49400000)
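/* SINCOS_BIAS is the same 1.5 * 2^19 rounding bias used for fexp2: adding
* it leaves s0 * (2/pi) rounded to 4 fractional bits in the low mantissa
* bits, i.e. s0 quantized to the table's pi/32 granularity */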
static void
bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
{
/* The bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
/* Approximate domain error (small) */
bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
MPI_OVER_TWO, s0);
/* Lookup sin(x), cos(x) */
bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
/* e^2 / 2 */
bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
bi_imm_u32(-1), BI_SPECIAL_NONE);
/* (-e^2)/2 f''(x) */
bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
cos ? cosx : sinx,
bi_negzero());
/* e f'(x) - (e^2/2) f''(x) */
bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
cos ? bi_neg(sinx) : cosx,
quadratic);
I->clamp = BI_CLAMP_CLAMP_M1_1;
/* f(x) + e f'(x) - (e^2/2) f''(x) */
bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
}
/*
* The XOR lane op is useful for derivative calculations, but not all Bifrost
* implementations have it. Add a safe helper that uses the hardware
* functionality when available and lowers where unavailable.
*/
static bi_index
bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
{
if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) {
return bi_clper_i32(b, s0, s1,
BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
BI_SUBGROUP_SUBGROUP4);
}
bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
return bi_clper_old_i32(b, s0, lane);
}
static enum bi_cmpf
bi_translate_cmpf(nir_op op)
{
switch (op) {
case nir_op_ieq8:
case nir_op_ieq16:
case nir_op_ieq32:
case nir_op_feq16:
case nir_op_feq32:
return BI_CMPF_EQ;
case nir_op_ine8:
case nir_op_ine16:
case nir_op_ine32:
case nir_op_fneu16:
case nir_op_fneu32:
return BI_CMPF_NE;
case nir_op_ilt8:
case nir_op_ilt16:
case nir_op_ilt32:
case nir_op_flt16:
case nir_op_flt32:
case nir_op_ult8:
case nir_op_ult16:
case nir_op_ult32:
return BI_CMPF_LT;
case nir_op_ige8:
case nir_op_ige16:
case nir_op_ige32:
case nir_op_fge16:
case nir_op_fge32:
case nir_op_uge8:
case nir_op_uge16:
case nir_op_uge32:
return BI_CMPF_GE;
default:
unreachable("invalid comparison");
}
}
static bool
bi_nir_is_replicated(nir_alu_src *src)
{
for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
if (src->swizzle[0] != src->swizzle[i])
return false;
}
return true;
}
static void
bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
{
bi_index dst = bi_dest_index(&instr->dest.dest);
unsigned srcs = nir_op_infos[instr->op].num_inputs;
unsigned sz = nir_dest_bit_size(instr->dest.dest);
unsigned comps = nir_dest_num_components(instr->dest.dest);
unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
/* Indicate scalarness */
if (sz == 16 && comps == 1)
dst.swizzle = BI_SWIZZLE_H00;
/* First, match against the various moves in NIR. These are
* special-cased because they can operate on vectors even after
* lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
* instruction is no "bigger" than SIMD-within-a-register. These moves
* are the exceptions that need to handle swizzles specially. */
switch (instr->op) {
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
case nir_op_vec8:
case nir_op_vec16: {
bi_index unoffset_srcs[16] = { bi_null() };
unsigned channels[16] = { 0 };
for (unsigned i = 0; i < srcs; ++i) {
unoffset_srcs[i] = bi_src_index(&instr->src[i].src);
channels[i] = instr->src[i].swizzle[0];
}
bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
return;
}
case nir_op_unpack_32_2x16: {
/* Should have been scalarized */
assert(comps == 2 && sz == 16);
bi_index vec = bi_src_index(&instr->src[0].src);
unsigned chan = instr->src[0].swizzle[0];
bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
return;
}
case nir_op_unpack_64_2x32_split_x:
{
unsigned chan = (instr->src[0].swizzle[0] * 2) + 0;
bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), chan));
return;
}
case nir_op_unpack_64_2x32_split_y:
{
unsigned chan = (instr->src[0].swizzle[0] * 2) + 1;
bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), chan));
return;
}
case nir_op_pack_64_2x32_split:
bi_collect_v2i32_to(b, dst,
bi_extract(b, bi_src_index(&instr->src[0].src), instr->src[0].swizzle[0]),
bi_extract(b, bi_src_index(&instr->src[1].src), instr->src[1].swizzle[0]));
return;
case nir_op_pack_64_2x32:
bi_collect_v2i32_to(b, dst,
bi_extract(b, bi_src_index(&instr->src[0].src), 0),
bi_extract(b, bi_src_index(&instr->src[0].src), 1));
return;
case nir_op_pack_uvec2_to_uint: {
bi_index src = bi_src_index(&instr->src[0].src);
assert(sz == 32 && src_sz == 32);
bi_mkvec_v2i16_to(b, dst, bi_half(bi_extract(b, src, 0), false),
bi_half(bi_extract(b, src, 1), false));
return;
}
case nir_op_pack_uvec4_to_uint: {
bi_index src = bi_src_index(&instr->src[0].src);
assert(sz == 32 && src_sz == 32);
bi_mkvec_v4i8_to(b, dst, bi_byte(bi_extract(b, src, 0), 0),
bi_byte(bi_extract(b, src, 1), 0),
bi_byte(bi_extract(b, src, 2), 0),
bi_byte(bi_extract(b, src, 3), 0));
return;
}
case nir_op_mov: {
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
unsigned channels[4] = {
comps > 0 ? instr->src[0].swizzle[0] : 0,
comps > 1 ? instr->src[0].swizzle[1] : 0,
comps > 2 ? instr->src[0].swizzle[2] : 0,
comps > 3 ? instr->src[0].swizzle[3] : 0,
};
bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
return;
}
case nir_op_pack_32_2x16: {
assert(comps == 1);
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
unsigned channels[2] = {
instr->src[0].swizzle[0],
instr->src[0].swizzle[1]
};
bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
return;
}
case nir_op_f2f16:
case nir_op_f2f16_rtz:
case nir_op_f2f16_rtne: {
assert(src_sz == 32);
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
bi_index s1 = comps > 1 ?
bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
/* Override rounding if explicitly requested. Otherwise, the
* default rounding mode is selected by the builder. Depending
* on the float controls required by the shader, the default
* mode may not be nearest-even.
*/
if (instr->op == nir_op_f2f16_rtz)
I->round = BI_ROUND_RTZ;
else if (instr->op == nir_op_f2f16_rtne)
I->round = BI_ROUND_NONE; /* Nearest even */
return;
}
/* Vectorized downcasts */
case nir_op_u2u16:
case nir_op_i2i16: {
if (!(src_sz == 32 && comps == 2))
break;
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
bi_mkvec_v2i16_to(b, dst,
bi_half(s0, false), bi_half(s1, false));
return;
}
/* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
* MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
* scalarizing due to scheduling (equal cost on Valhall). Additionally
* if the source is replicated the MKVEC.v2i16 can be optimized out.
*/
case nir_op_u2f16:
case nir_op_i2f16: {
if (!(src_sz == 32 && comps == 2))
break;
nir_alu_src *src = &instr->src[0];
bi_index idx = bi_src_index(&src->src);
bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
bi_index t = (src->swizzle[0] == src->swizzle[1]) ?
bi_half(s0, false) :
bi_mkvec_v2i16(b, bi_half(s0, false),
bi_half(s1, false));
if (instr->op == nir_op_u2f16)
bi_v2u16_to_v2f16_to(b, dst, t);
else
bi_v2s16_to_v2f16_to(b, dst, t);
return;
}
case nir_op_i2i8:
case nir_op_u2u8:
{
/* Acts like an 8-bit swizzle */
bi_index idx = bi_src_index(&instr->src[0].src);
unsigned factor = src_sz / 8;
unsigned chan[4] = { 0 };
for (unsigned i = 0; i < comps; ++i)
chan[i] = instr->src[0].swizzle[i] * factor;
bi_make_vec_to(b, dst, &idx, chan, comps, 8);
return;
}
case nir_op_b32csel:
{
if (sz != 16)
break;
/* We allow vectorizing b32csel(cond, A, B) which can be
* translated as MUX.v2i16, even though cond is a 32-bit vector.
*
* If the source condition vector is replicated, we can use
* MUX.v2i16 directly, letting each component use the
* corresponding half of the 32-bit source. NIR uses 0/~0
* booleans so that's guaranteed to work (that is, 32-bit NIR
* booleans are 16-bit replicated).
*
* If we're not replicated, we use the same trick but must
* insert a MKVEC.v2i16 first to convert down to 16-bit.
*/
bi_index idx = bi_src_index(&instr->src[0].src);
bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
if (!bi_nir_is_replicated(&instr->src[0])) {
s0 = bi_mkvec_v2i16(b, bi_half(s0, false),
bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
}
bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
return;
}
default:
break;
}
bi_index s0 = srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
bi_index s1 = srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
bi_index s2 = srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
switch (instr->op) {
case nir_op_ffma:
bi_fma_to(b, sz, dst, s0, s1, s2);
break;
case nir_op_fmul:
bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
break;
case nir_op_fsub:
s1 = bi_neg(s1);
FALLTHROUGH;
case nir_op_fadd:
bi_fadd_to(b, sz, dst, s0, s1);
break;
case nir_op_fsat: {
bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
I->clamp = BI_CLAMP_CLAMP_0_1;
break;
}
case nir_op_fsat_signed_mali: {
bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
I->clamp = BI_CLAMP_CLAMP_M1_1;
break;
}
case nir_op_fclamp_pos_mali: {
bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
I->clamp = BI_CLAMP_CLAMP_0_INF;
break;
}
case nir_op_fneg:
bi_fabsneg_to(b, sz, dst, bi_neg(s0));
break;
case nir_op_fabs:
bi_fabsneg_to(b, sz, dst, bi_abs(s0));
break;
case nir_op_fsin:
bi_lower_fsincos_32(b, dst, s0, false);
break;
case nir_op_fcos:
bi_lower_fsincos_32(b, dst, s0, true);
break;
case nir_op_fexp2:
assert(sz == 32); /* should've been lowered */
if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
bi_lower_fexp2_32(b, dst, s0);
else
bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
break;
case nir_op_flog2:
assert(sz == 32); /* should've been lowered */
if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
bi_lower_flog2_32(b, dst, s0);
else
bi_flog2_32(b, dst, s0);
break;
case nir_op_fpow:
assert(sz == 32); /* should've been lowered */
if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
bi_lower_fpow_32(b, dst, s0, s1);
else
bi_fpow_32(b, dst, s0, s1);
break;
case nir_op_frexp_exp:
bi_frexpe_to(b, sz, dst, s0, false, false);
break;
case nir_op_frexp_sig:
bi_frexpm_to(b, sz, dst, s0, false, false);
break;
case nir_op_ldexp:
bi_ldexp_to(b, sz, dst, s0, s1);
break;
case nir_op_b8csel:
bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
break;
case nir_op_b16csel:
bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
break;
case nir_op_b32csel:
bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
break;
case nir_op_extract_u8:
case nir_op_extract_i8: {
assert(comps == 1 && "should be scalarized");
assert((src_sz == 16 || src_sz == 32) && "should be lowered");
unsigned byte = nir_src_as_uint(instr->src[1].src);
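/* A 16-bit source carrying an H11 swizzle addresses the upper half-word, so
* its bytes 0-1 live at bytes 2-3 of the full 32-bit word; rebase the byte
* index accordingly and fall back to the identity H01 swizzle */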
if (s0.swizzle == BI_SWIZZLE_H11) {
assert(byte < 2);
byte += 2;
} else if (s0.swizzle != BI_SWIZZLE_H01) {
assert(s0.swizzle == BI_SWIZZLE_H00);
}
assert(byte < 4);
s0.swizzle = BI_SWIZZLE_H01;
if (instr->op == nir_op_extract_i8)
bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
else
bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
break;
}
case nir_op_extract_u16:
case nir_op_extract_i16: {
assert(comps == 1 && "should be scalarized");
assert(src_sz == 32 && "should be lowered");
unsigned half = nir_src_as_uint(instr->src[1].src);
assert(half == 0 || half == 1);
if (instr->op == nir_op_extract_i16)
bi_s16_to_s32_to(b, dst, bi_half(s0, half));
else
bi_u16_to_u32_to(b, dst, bi_half(s0, half));
break;
}
case nir_op_insert_u16: {
assert(comps == 1 && "should be scalarized");
unsigned half = nir_src_as_uint(instr->src[1].src);
assert(half == 0 || half == 1);
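/* insert_u16 places the low 16 bits of s0 at bit 16 * half with zeroes
* elsewhere: half == 0 is a plain zero-extend, half == 1 packs s0 into the
* top half above a zero low half */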
if (half == 0)
bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
else
bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
break;
}
case nir_op_ishl:
bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
break;
case nir_op_ushr:
bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
break;
case nir_op_ishr:
if (b->shader->arch >= 9)
bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
else
bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
break;
case nir_op_imin:
case nir_op_umin:
bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
s0, s1, s0, s1, BI_CMPF_LT);
break;
case nir_op_imax:
case nir_op_umax:
bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
s0, s1, s0, s1, BI_CMPF_GT);
break;
case nir_op_fddx_must_abs_mali:
case nir_op_fddy_must_abs_mali: {
bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
bi_index adjacent = bi_clper_xor(b, s0, bit);
bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0));
break;
}
case nir_op_fddx:
case nir_op_fddy:
case nir_op_fddx_coarse:
case nir_op_fddy_coarse:
case nir_op_fddx_fine:
case nir_op_fddy_fine: {
unsigned axis;
switch (instr->op) {
case nir_op_fddx:
case nir_op_fddx_coarse:
case nir_op_fddx_fine:
axis = 1;
break;
case nir_op_fddy:
case nir_op_fddy_coarse:
case nir_op_fddy_fine:
axis = 2;
break;
default:
unreachable("Invalid derivative op");
}
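/* Within a 2x2 quad, bit 0 of the lane ID selects the horizontal neighbour
* and bit 1 the vertical one. Fine derivatives difference each lane against
* its neighbour along the chosen axis; coarse derivatives reuse lanes 0 and
* `axis` for the whole quad. */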
bi_index lane1, lane2;
switch (instr->op) {
case nir_op_fddx:
case nir_op_fddx_fine:
case nir_op_fddy:
case nir_op_fddy_fine:
lane1 = bi_lshift_and_i32(b,
bi_fau(BIR_FAU_LANE_ID, false),
bi_imm_u32(0x3 & ~axis),
bi_imm_u8(0));
lane2 = bi_iadd_u32(b, lane1,
bi_imm_u32(axis),
false);
break;
case nir_op_fddx_coarse:
case nir_op_fddy_coarse:
lane1 = bi_imm_u32(0);
lane2 = bi_imm_u32(axis);
break;
default:
unreachable("Invalid derivative op");
}
bi_index left, right;
if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
left = bi_clper_old_i32(b, s0, lane1);
right = bi_clper_old_i32(b, s0, lane2);
} else {
left = bi_clper_i32(b, s0, lane1,
BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
BI_SUBGROUP_SUBGROUP4);
right = bi_clper_i32(b, s0, lane2,
BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
BI_SUBGROUP_SUBGROUP4);
}
bi_fadd_to(b, sz, dst, right, bi_neg(left));
break;
}
case nir_op_f2f32:
bi_f16_to_f32_to(b, dst, s0);
break;
case nir_op_fquantize2f16:
{
bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
f16->ftz = f32->ftz = true;
break;
}
case nir_op_f2i32:
if (src_sz == 32)
bi_f32_to_s32_to(b, dst, s0);
else
bi_f16_to_s32_to(b, dst, s0);
break;
/* Note 32-bit sources => no vectorization, so 32-bit works */
case nir_op_f2u16:
if (src_sz == 32)
bi_f32_to_u32_to(b, dst, s0);
else
bi_v2f16_to_v2u16_to(b, dst, s0);
break;
case nir_op_f2i16:
if (src_sz == 32)
bi_f32_to_s32_to(b, dst, s0);
else
bi_v2f16_to_v2s16_to(b, dst, s0);
break;
case nir_op_f2u32:
if (src_sz == 32)
bi_f32_to_u32_to(b, dst, s0);
else
bi_f16_to_u32_to(b, dst, s0);
break;