src/amd/common/ac_nir_lower_esgs_io_to_mem.c - third_party/mesa - Git at Google

 /*
  * Copyright © 2021 Valve Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
  */

 #include "ac_nir.h"
 #include "nir_builder.h"

 /*
  * Lower NIR cross-stage I/O intrinsics into the memory accesses that actually happen on the HW.
  *
  * These HW stages are used only when a Geometry Shader is used.
  * Export Shader (ES) runs the SW stage before GS, can be either VS or TES.
  *
  * * GFX6-8:
  *   ES and GS are separate HW stages.
  *   I/O is passed between them through VRAM.
  * * GFX9+:
  *   ES and GS are merged into a single HW stage.
  *   I/O is passed between them through LDS.
  *
  */

 typedef struct {
    /* Which hardware generation we're dealing with */
    enum chip_class chip_class;

    /* Number of ES outputs for which memory should be reserved.
     * When compacted, this should be the number of linked ES outputs.
     */
    unsigned num_reserved_es_outputs;
 } lower_esgs_io_state;

 static nir_ssa_def *
 emit_split_buffer_load(nir_builder *b, nir_ssa_def *desc, nir_ssa_def *v_off, nir_ssa_def *s_off,
                        unsigned component_stride, unsigned num_components, unsigned bit_size)
 {
    unsigned total_bytes = num_components * bit_size / 8u;
    unsigned full_dwords = total_bytes / 4u;
    unsigned remaining_bytes = total_bytes - full_dwords * 4u;

    /* Accomodate max number of split 64-bit loads */
    nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS * 2u];

    /* Assume that 1x32-bit load is better than 1x16-bit + 1x8-bit */
    if (remaining_bytes == 3) {
       remaining_bytes = 0;
       full_dwords++;
    }

    for (unsigned i = 0; i < full_dwords; ++i)
       comps[i] = nir_build_load_buffer_amd(b, 1, 32, desc, v_off, s_off,
                                            .base = component_stride * i, .memory_modes = nir_var_shader_in);

    if (remaining_bytes)
       comps[full_dwords] = nir_build_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off,
                                                      .base = component_stride * full_dwords, .memory_modes = nir_var_shader_in);

    return nir_extract_bits(b, comps, full_dwords + !!remaining_bytes, 0, num_components, bit_size);
 }

 static void
 emit_split_buffer_store(nir_builder *b, nir_ssa_def *d, nir_ssa_def *desc, nir_ssa_def *v_off, nir_ssa_def *s_off,
                         unsigned component_stride, unsigned num_components, unsigned bit_size,
                         unsigned writemask, bool swizzled, bool slc)
 {
    while (writemask) {
       int start, count;
       u_bit_scan_consecutive_range(&writemask, &start, &count);
       assert(start >= 0 && count >= 0);

       unsigned bytes = count * bit_size / 8u;
       unsigned start_byte = start * bit_size / 8u;

       while (bytes) {
          unsigned store_bytes = MIN2(bytes, 4u);
          if ((start_byte % 4) == 1 || (start_byte % 4) == 3)
             store_bytes = MIN2(store_bytes, 1);
          else if ((start_byte % 4) == 2)
             store_bytes = MIN2(store_bytes, 2);

          nir_ssa_def *store_val = nir_extract_bits(b, &d, 1, start_byte * 8u, 1, store_bytes * 8u);
          nir_build_store_buffer_amd(b, store_val, desc, v_off, s_off, .is_swizzled = swizzled, .slc_amd = slc,
                                     .base = start_byte, .write_mask = 1u, .memory_modes = nir_var_shader_out);

          start_byte += store_bytes;
          bytes -= store_bytes;
       }
    }
 }

 static bool
 lower_es_output_store(nir_builder *b,
                       nir_instr *instr,
                       void *state)
 {
    if (instr->type != nir_instr_type_intrinsic)
       return false;

    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

    if (intrin->intrinsic != nir_intrinsic_store_output)
       return false;

    lower_esgs_io_state *st = (lower_esgs_io_state *) state;
    unsigned write_mask = nir_intrinsic_write_mask(intrin);

    b->cursor = nir_before_instr(instr);
    nir_ssa_def *io_off = nir_build_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u);

    if (st->chip_class <= GFX8) {
       /* GFX6-8: ES is a separate HW stage, data is passed from ES to GS in VRAM. */
       nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
       nir_ssa_def *es2gs_off = nir_build_load_ring_es2gs_offset_amd(b);
       emit_split_buffer_store(b, intrin->src[0].ssa, ring, io_off, es2gs_off, 4u,
                               intrin->src[0].ssa->num_components, intrin->src[0].ssa->bit_size,
                               write_mask, true, true);
    } else {
       /* GFX9+: ES is merged into GS, data is passed through LDS. */
       unsigned esgs_itemsize = st->num_reserved_es_outputs * 16u;
       nir_ssa_def *vertex_idx = nir_build_load_local_invocation_index(b);
       nir_ssa_def *off = nir_iadd(b, nir_imul_imm(b, vertex_idx, esgs_itemsize), io_off);
       nir_build_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask,
                              .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
    }

    nir_instr_remove(instr);
    return true;
 }

 static nir_ssa_def *
 gs_per_vertex_input_vertex_offset_gfx6(nir_builder *b, nir_src *vertex_src)
 {
    if (nir_src_is_const(*vertex_src))
       return nir_build_load_gs_vertex_offset_amd(b, .base = nir_src_as_uint(*vertex_src));

    nir_ssa_def *vertex_offset = nir_build_load_gs_vertex_offset_amd(b, .base = 0);

    for (unsigned i = 1; i < b->shader->info.gs.vertices_in; ++i) {
       nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
       nir_ssa_def *elem = nir_build_load_gs_vertex_offset_amd(b, .base = i);
       vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
    }

    return vertex_offset;
 }

 static nir_ssa_def *
 gs_per_vertex_input_vertex_offset_gfx9(nir_builder *b, nir_src *vertex_src)
 {
    if (nir_src_is_const(*vertex_src)) {
       unsigned vertex = nir_src_as_uint(*vertex_src);
       return nir_ubfe(b, nir_build_load_gs_vertex_offset_amd(b, .base = vertex / 2u * 2u),
                       nir_imm_int(b, (vertex % 2u) * 16u), nir_imm_int(b, 16u));
    }

    nir_ssa_def *vertex_offset = nir_build_load_gs_vertex_offset_amd(b, .base = 0);

    for (unsigned i = 1; i < b->shader->info.gs.vertices_in; i++) {
       nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
       nir_ssa_def *elem = nir_build_load_gs_vertex_offset_amd(b, .base = i / 2u * 2u);
       if (i % 2u)
          elem = nir_ishr_imm(b, elem, 16u);

       vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
    }

    return nir_iand_imm(b, vertex_offset, 0xffffu);
 }

 static nir_ssa_def *
 gs_per_vertex_input_offset(nir_builder *b,
                            lower_esgs_io_state *st,
                            nir_intrinsic_instr *instr)
 {
    nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
    nir_ssa_def *vertex_offset = st->chip_class >= GFX9
                                 ? gs_per_vertex_input_vertex_offset_gfx9(b, vertex_src)
                                 : gs_per_vertex_input_vertex_offset_gfx6(b, vertex_src);

    unsigned base_stride = st->chip_class >= GFX9 ? 1 : 64 /* Wave size on GFX6-8 */;
    nir_ssa_def *io_off = nir_build_calc_io_offset(b, instr, nir_imm_int(b, base_stride * 4u), base_stride);
    nir_ssa_def *off = nir_iadd(b, io_off, vertex_offset);
    return nir_imul_imm(b, off, 4u);
 }

 static nir_ssa_def *
 lower_gs_per_vertex_input_load(nir_builder *b,
                                nir_instr *instr,
                                void *state)
 {
    lower_esgs_io_state *st = (lower_esgs_io_state *) state;
    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
    nir_ssa_def *off = gs_per_vertex_input_offset(b, st, intrin);

    if (st->chip_class >= GFX9)
       return nir_build_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
                                    .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);

    unsigned wave_size = 64u; /* GFX6-8 only support wave64 */
    nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
    return emit_split_buffer_load(b, ring, off, nir_imm_zero(b, 1, 32), 4u * wave_size,
                                  intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size);
 }

 static bool
 filter_load_per_vertex_input(const nir_instr *instr, UNUSED const void *state)
 {
    return instr->type == nir_instr_type_intrinsic && nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_per_vertex_input;
 }

 void
 ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
                                enum chip_class chip_class,
                                unsigned num_reserved_es_outputs)
 {
    lower_esgs_io_state state = {
       .chip_class = chip_class,
       .num_reserved_es_outputs = num_reserved_es_outputs,
    };

    nir_shader_instructions_pass(shader,
                                 lower_es_output_store,
                                 nir_metadata_block_index | nir_metadata_dominance,
                                 &state);
 }

 void
 ac_nir_lower_gs_inputs_to_mem(nir_shader *shader,
                               enum chip_class chip_class,
                               unsigned num_reserved_es_outputs)
 {
    lower_esgs_io_state state = {
       .chip_class = chip_class,
       .num_reserved_es_outputs = num_reserved_es_outputs,
    };

    nir_shader_lower_instructions(shader,
                                  filter_load_per_vertex_input,
                                  lower_gs_per_vertex_input_load,
                                  &state);
 }
	/*
	* Copyright © 2021 Valve Corporation
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	* IN THE SOFTWARE.
	*
	*/

	#include "ac_nir.h"
	#include "nir_builder.h"

	/*
	* Lower NIR cross-stage I/O intrinsics into the memory accesses that actually happen on the HW.
	*
	* These HW stages are used only when a Geometry Shader is used.
	* Export Shader (ES) runs the SW stage before GS, can be either VS or TES.
	*
	* * GFX6-8:
	* ES and GS are separate HW stages.
	* I/O is passed between them through VRAM.
	* * GFX9+:
	* ES and GS are merged into a single HW stage.
	* I/O is passed between them through LDS.
	*
	*/

	typedef struct {
	/* Which hardware generation we're dealing with */
	enum chip_class chip_class;

	/* Number of ES outputs for which memory should be reserved.
	* When compacted, this should be the number of linked ES outputs.
	*/
	unsigned num_reserved_es_outputs;
	} lower_esgs_io_state;

	static nir_ssa_def *
	emit_split_buffer_load(nir_builder b, nir_ssa_def desc, nir_ssa_def v_off, nir_ssa_def s_off,
	unsigned component_stride, unsigned num_components, unsigned bit_size)
	{
	unsigned total_bytes = num_components * bit_size / 8u;
	unsigned full_dwords = total_bytes / 4u;
	unsigned remaining_bytes = total_bytes - full_dwords * 4u;

	/* Accomodate max number of split 64-bit loads */
	nir_ssa_def comps[NIR_MAX_VEC_COMPONENTS 2u];

	/* Assume that 1x32-bit load is better than 1x16-bit + 1x8-bit */
	if (remaining_bytes == 3) {
	remaining_bytes = 0;
	full_dwords++;
	}

	for (unsigned i = 0; i < full_dwords; ++i)
	comps[i] = nir_build_load_buffer_amd(b, 1, 32, desc, v_off, s_off,
	.base = component_stride * i, .memory_modes = nir_var_shader_in);

	if (remaining_bytes)
	comps[full_dwords] = nir_build_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off,
	.base = component_stride * full_dwords, .memory_modes = nir_var_shader_in);

	return nir_extract_bits(b, comps, full_dwords + !!remaining_bytes, 0, num_components, bit_size);
	}

	static void
	emit_split_buffer_store(nir_builder b, nir_ssa_def d, nir_ssa_def desc, nir_ssa_def v_off, nir_ssa_def *s_off,
	unsigned component_stride, unsigned num_components, unsigned bit_size,
	unsigned writemask, bool swizzled, bool slc)
	{
	while (writemask) {
	int start, count;
	u_bit_scan_consecutive_range(&writemask, &start, &count);
	assert(start >= 0 && count >= 0);

	unsigned bytes = count * bit_size / 8u;
	unsigned start_byte = start * bit_size / 8u;

	while (bytes) {
	unsigned store_bytes = MIN2(bytes, 4u);
	if ((start_byte % 4) == 1 \|\| (start_byte % 4) == 3)
	store_bytes = MIN2(store_bytes, 1);
	else if ((start_byte % 4) == 2)
	store_bytes = MIN2(store_bytes, 2);

	nir_ssa_def store_val = nir_extract_bits(b, &d, 1, start_byte 8u, 1, store_bytes * 8u);
	nir_build_store_buffer_amd(b, store_val, desc, v_off, s_off, .is_swizzled = swizzled, .slc_amd = slc,
	.base = start_byte, .write_mask = 1u, .memory_modes = nir_var_shader_out);

	start_byte += store_bytes;
	bytes -= store_bytes;
	}
	}
	}

	static bool
	lower_es_output_store(nir_builder *b,
	nir_instr *instr,
	void *state)
	{
	if (instr->type != nir_instr_type_intrinsic)
	return false;

	nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

	if (intrin->intrinsic != nir_intrinsic_store_output)
	return false;

	lower_esgs_io_state st = (lower_esgs_io_state ) state;
	unsigned write_mask = nir_intrinsic_write_mask(intrin);

	b->cursor = nir_before_instr(instr);
	nir_ssa_def *io_off = nir_build_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u);

	if (st->chip_class <= GFX8) {
	/* GFX6-8: ES is a separate HW stage, data is passed from ES to GS in VRAM. */
	nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
	nir_ssa_def *es2gs_off = nir_build_load_ring_es2gs_offset_amd(b);
	emit_split_buffer_store(b, intrin->src[0].ssa, ring, io_off, es2gs_off, 4u,
	intrin->src[0].ssa->num_components, intrin->src[0].ssa->bit_size,
	write_mask, true, true);
	} else {
	/* GFX9+: ES is merged into GS, data is passed through LDS. */
	unsigned esgs_itemsize = st->num_reserved_es_outputs * 16u;
	nir_ssa_def *vertex_idx = nir_build_load_local_invocation_index(b);
	nir_ssa_def *off = nir_iadd(b, nir_imul_imm(b, vertex_idx, esgs_itemsize), io_off);
	nir_build_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask,
	.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
	}

	nir_instr_remove(instr);
	return true;
	}

	static nir_ssa_def *
	gs_per_vertex_input_vertex_offset_gfx6(nir_builder b, nir_src vertex_src)
	{
	if (nir_src_is_const(*vertex_src))
	return nir_build_load_gs_vertex_offset_amd(b, .base = nir_src_as_uint(*vertex_src));

	nir_ssa_def *vertex_offset = nir_build_load_gs_vertex_offset_amd(b, .base = 0);

	for (unsigned i = 1; i < b->shader->info.gs.vertices_in; ++i) {
	nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
	nir_ssa_def *elem = nir_build_load_gs_vertex_offset_amd(b, .base = i);
	vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
	}

	return vertex_offset;
	}

	static nir_ssa_def *
	gs_per_vertex_input_vertex_offset_gfx9(nir_builder b, nir_src vertex_src)
	{
	if (nir_src_is_const(*vertex_src)) {
	unsigned vertex = nir_src_as_uint(*vertex_src);
	return nir_ubfe(b, nir_build_load_gs_vertex_offset_amd(b, .base = vertex / 2u * 2u),
	nir_imm_int(b, (vertex % 2u) * 16u), nir_imm_int(b, 16u));
	}

	nir_ssa_def *vertex_offset = nir_build_load_gs_vertex_offset_amd(b, .base = 0);

	for (unsigned i = 1; i < b->shader->info.gs.vertices_in; i++) {
	nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
	nir_ssa_def elem = nir_build_load_gs_vertex_offset_amd(b, .base = i / 2u 2u);
	if (i % 2u)
	elem = nir_ishr_imm(b, elem, 16u);

	vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
	}

	return nir_iand_imm(b, vertex_offset, 0xffffu);
	}

	static nir_ssa_def *
	gs_per_vertex_input_offset(nir_builder *b,
	lower_esgs_io_state *st,
	nir_intrinsic_instr *instr)
	{
	nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
	nir_ssa_def *vertex_offset = st->chip_class >= GFX9
	? gs_per_vertex_input_vertex_offset_gfx9(b, vertex_src)
	: gs_per_vertex_input_vertex_offset_gfx6(b, vertex_src);

	unsigned base_stride = st->chip_class >= GFX9 ? 1 : 64 /* Wave size on GFX6-8 */;
	nir_ssa_def io_off = nir_build_calc_io_offset(b, instr, nir_imm_int(b, base_stride 4u), base_stride);
	nir_ssa_def *off = nir_iadd(b, io_off, vertex_offset);
	return nir_imul_imm(b, off, 4u);
	}

	static nir_ssa_def *
	lower_gs_per_vertex_input_load(nir_builder *b,
	nir_instr *instr,
	void *state)
	{
	lower_esgs_io_state st = (lower_esgs_io_state ) state;
	nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
	nir_ssa_def *off = gs_per_vertex_input_offset(b, st, intrin);

	if (st->chip_class >= GFX9)
	return nir_build_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
	.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);

	unsigned wave_size = 64u; /* GFX6-8 only support wave64 */
	nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
	return emit_split_buffer_load(b, ring, off, nir_imm_zero(b, 1, 32), 4u * wave_size,
	intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size);
	}

	static bool
	filter_load_per_vertex_input(const nir_instr instr, UNUSED const void state)
	{
	return instr->type == nir_instr_type_intrinsic && nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_per_vertex_input;
	}

	void
	ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
	enum chip_class chip_class,
	unsigned num_reserved_es_outputs)
	{
	lower_esgs_io_state state = {
	.chip_class = chip_class,
	.num_reserved_es_outputs = num_reserved_es_outputs,
	};

	nir_shader_instructions_pass(shader,
	lower_es_output_store,
	nir_metadata_block_index \| nir_metadata_dominance,
	&state);
	}

	void
	ac_nir_lower_gs_inputs_to_mem(nir_shader *shader,
	enum chip_class chip_class,
	unsigned num_reserved_es_outputs)
	{
	lower_esgs_io_state state = {
	.chip_class = chip_class,
	.num_reserved_es_outputs = num_reserved_es_outputs,
	};

	nir_shader_lower_instructions(shader,
	filter_load_per_vertex_input,
	lower_gs_per_vertex_input_load,
	&state);
	}