/*
* Copyright 2025 Advanced Micro Devices, Inc.
* SPDX-License-Identifier: MIT
 */

/* This pass expects IO intrinsics (load_input/load_output/...) and lowers
* loads with indirect slot indexing to temp indexing. Supported cases:
* - VS, TCS, TES, GS, FS inputs
* - TCS outputs
*
* Indirect loads are replaced with direct loads whose results are stored in
* a temp array, and the original load is replaced with an indirect load from
* the temp array.
*
* Direct loads are typically inserted at the beginning of the shader, and only
* the temp array loads are inserted at the place of the original indirect
* load.
*
 * The exceptions are TCS output loads, interpolate_at_* loads, and
 * per-vertex input loads with a non-constant vertex index, where the
 * direct loads are always inserted at the place of the original indirect
 * load.
*/
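
/* Illustrative sketch (not actual NIR syntax): an indirect load over a
 * 3-slot input array, e.g.
 *
 *    result = load_input(offset = index)   // num_slots = 3
 *
 * conceptually becomes
 *
 *    temp[0] = load_input(offset = 0)      // location + 0
 *    temp[1] = load_input(offset = 0)      // location + 1
 *    temp[2] = load_input(offset = 0)      // location + 2
 *    result  = temp[index]
 *
 * where temp is a local (nir_var_function_temp) array variable created
 * with nir_local_variable_create().
 */
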
#include "nir.h"
#include "nir_builder.h"
#include <c99_alloca.h>

typedef struct {
nir_variable *var[INTERP_MODE_COUNT];
nir_variable *centroid[INTERP_MODE_COUNT];
nir_variable *sample[INTERP_MODE_COUNT];
unsigned first_comp;
unsigned last_comp;
bool declared;
} var_info;

typedef struct {
nir_variable_mode modes;
   /* If arrays are loaded only once at the beginning of the shader, these
    * are the local variables that cache them.
    */
var_info input[NUM_TOTAL_VARYING_SLOTS];
var_info input_hi[NUM_TOTAL_VARYING_SLOTS];
var_info output[NUM_TOTAL_VARYING_SLOTS];
var_info output_hi[NUM_TOTAL_VARYING_SLOTS];
} lower_io_indir_loads_state;
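
/* Return whether the load refers to a "compact" array, i.e. one whose
 * elements are packed into vec4 components (gl_ClipDistance/gl_CullDistance
 * and the TCS gl_TessLevel* arrays), so an indirect index selects a
 * component rather than a whole slot.
 */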
static bool
is_compact(nir_shader *nir, nir_intrinsic_instr *intr)
{
if (!nir->options->compact_arrays ||
(nir->info.stage == MESA_SHADER_VERTEX && !nir_is_output_load(intr)))
return false;
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
return (sem.location >= VARYING_SLOT_CLIP_DIST0 &&
sem.location <= VARYING_SLOT_CULL_DIST1) ||
(nir->info.stage == MESA_SHADER_TESS_CTRL &&
(sem.location == VARYING_SLOT_TESS_LEVEL_INNER ||
sem.location == VARYING_SLOT_TESS_LEVEL_OUTER));
}

static var_info *
get_load_var(nir_intrinsic_instr *intr, lower_io_indir_loads_state *state)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
bool is_output = nir_is_output_load(intr);
bool high = sem.high_dvec2 || sem.high_16bits;
return is_output ? (high ? &state->output_hi[sem.location] :
&state->output[sem.location]) :
(high ? &state->input_hi[sem.location] :
&state->input[sem.location]);
}
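
/* Return a pointer to the slot where the local variable caching this load's
 * source array is (or will be) stored, or NULL if the array must be reloaded
 * in place at every indirect load (TCS outputs, non-constant vertex indices,
 * interpolate_at_offset/at_sample).
 */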
static nir_variable **
get_load_once_variable(gl_shader_stage stage, nir_intrinsic_instr *intr,
lower_io_indir_loads_state *state)
{
if (intr->intrinsic == nir_intrinsic_load_interpolated_input) {
/* FS input arrays are reloaded at every interpolate_at_offset/at_sample
* because we assume that the same inputs can also be interpolated at
* pixel/centroid.
*/
nir_intrinsic_instr *baryc = nir_src_as_intrinsic(intr->src[0]);
if (!baryc)
         return NULL;
enum glsl_interp_mode interp = nir_intrinsic_interp_mode(baryc);
switch (baryc->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
return &get_load_var(intr, state)->var[interp];
case nir_intrinsic_load_barycentric_centroid:
return &get_load_var(intr, state)->centroid[interp];
case nir_intrinsic_load_barycentric_sample:
return &get_load_var(intr, state)->sample[interp];
default:
return NULL;
}
} else if (intr->intrinsic == nir_intrinsic_load_per_vertex_input ||
intr->intrinsic == nir_intrinsic_load_input_vertex) {
/* The per-vertex input is loaded at the beginning (not in place) only
* if the vertex index is constant.
*/
if (!nir_scalar_is_const(nir_scalar_resolved(intr->src[0].ssa, 0)))
return NULL;
} else if (stage == MESA_SHADER_TESS_CTRL && nir_is_output_load(intr)) {
/* TCS output arrays are reloaded at every indirect load. */
return NULL;
}
/* Other inputs can be loaded at the beginning. */
return &get_load_var(intr, state)->var[0];
}
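
/* Record, for every input slot, the range of components read by any load,
 * so that arrays loaded once at the beginning of the shader cover every
 * component that is used anywhere.
 */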
static bool
gather_indirect_inputs(struct nir_builder *b, nir_intrinsic_instr *intr,
void *data)
{
if (!nir_intrinsic_has_io_semantics(intr) ||
!nir_intrinsic_infos[intr->intrinsic].has_dest ||
nir_is_output_load(intr))
return false;
lower_io_indir_loads_state *state = (lower_io_indir_loads_state *)data;
unsigned component = nir_intrinsic_component(intr);
var_info *var = get_load_var(intr, state);
unsigned first_comp = component + nir_def_first_component_read(&intr->def);
unsigned last_comp = component + nir_def_last_component_read(&intr->def);
if (!var->declared) {
var->declared = true;
var->first_comp = first_comp;
var->last_comp = last_comp;
} else {
var->first_comp = MIN2(var->first_comp, first_comp);
var->last_comp = MAX2(var->last_comp, last_comp);
}
return false;
}

static bool
lower_load(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
if (!nir_intrinsic_has_io_semantics(intr) ||
!nir_intrinsic_infos[intr->intrinsic].has_dest)
return false;
lower_io_indir_loads_state *state = (lower_io_indir_loads_state *)data;
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
bool is_output = nir_is_output_load(intr);
bool compact = is_compact(b->shader, intr);
bool progress = false;
/* Lower indirect loads. This works for all loads (input, output,
* interpolated) because we don't care what the load is. We just duplicate
* it for each slot and set the offset/location.
*/
if (state->modes & (is_output ? nir_var_shader_out : nir_var_shader_in)) {
nir_scalar array_index =
nir_scalar_resolved(nir_get_io_offset_src(intr)->ssa, 0);
if (!nir_scalar_is_const(array_index)) {
nir_variable *temp = NULL;
nir_variable **array =
get_load_once_variable(b->shader->info.stage, intr, state);
unsigned first_comp, last_comp;
unsigned intr_component = nir_intrinsic_component(intr);
         /* Load only the array components that are actually used. If the
          * array is loaded at the beginning, load all components that are
          * read anywhere in the shader.
          */
if (array) {
var_info *var = get_load_var(intr, state);
assert(var->declared);
first_comp = var->first_comp;
last_comp = var->last_comp;
} else {
first_comp = intr_component + nir_def_first_component_read(&intr->def);
last_comp = intr_component + nir_def_last_component_read(&intr->def);
}
/* Some arrays are loaded only once at the beginning of the shader,
* others are loaded at every indirect load (such as TCS output loads).
*/
if (!array || !*array) {
nir_def **elems = alloca(sizeof(nir_def*) * sem.num_slots);
unsigned num_comp = last_comp - first_comp + 1;
if (array)
b->cursor = nir_before_impl(b->impl);
else
b->cursor = nir_before_instr(&intr->instr);
nir_def *zero = nir_imm_int(b, 0);
nir_def *src0 = NULL;
/* Load the barycentrics if we are loading inputs at the beginning. */
if (array &&
intr->intrinsic == nir_intrinsic_load_interpolated_input) {
nir_intrinsic_instr *baryc_intr = nir_src_as_intrinsic(intr->src[0]);
src0 = nir_load_barycentric(b, baryc_intr->intrinsic,
nir_intrinsic_interp_mode(baryc_intr));
}
/* Load the vertex index at the beginning if it's constant. */
if (array &&
(intr->intrinsic == nir_intrinsic_load_per_vertex_input ||
intr->intrinsic == nir_intrinsic_load_input_vertex)) {
nir_scalar s = nir_scalar_resolved(intr->src[0].ssa, 0);
assert(nir_scalar_is_const(s));
src0 = nir_imm_int(b, nir_scalar_as_uint(s));
}
/* Load individual array elements. */
for (unsigned i = 0; i < sem.num_slots; i++) {
/* Create a new load for each slot. */
nir_intrinsic_instr *new_load =
nir_intrinsic_instr_create(b->shader, intr->intrinsic);
new_load->num_components = num_comp;
nir_def_init(&new_load->instr, &new_load->def,
num_comp, intr->def.bit_size);
nir_intrinsic_copy_const_indices(new_load, intr);
/* Set the same srcs .. */
for (unsigned src = 0;
src < nir_intrinsic_infos[intr->intrinsic].num_srcs; src++)
new_load->src[src] = nir_src_for_ssa(intr->src[src].ssa);
/* .. but change the indirect index to 0. */
new_load->src[nir_get_io_offset_src_number(intr)] =
nir_src_for_ssa(zero);
nir_intrinsic_set_component(new_load, first_comp);
/* Set barycentrics or the vertex index if we are loading inputs
* at the beginning.
*/
if (src0)
new_load->src[0] = nir_src_for_ssa(src0);
/* and set IO semantics to the location of the array element. */
nir_io_semantics new_sem = sem;
new_sem.num_slots = 1;
if (!compact) {
new_sem.location += i;
nir_intrinsic_set_base(new_load, nir_intrinsic_base(intr) + i);
} else {
new_sem.location += i / 4;
nir_intrinsic_set_component(new_load, i % 4);
nir_intrinsic_set_base(new_load, nir_intrinsic_base(intr) + i / 4);
}
nir_intrinsic_set_io_semantics(new_load, new_sem);
nir_builder_instr_insert(b, &new_load->instr);
elems[i] = &new_load->def;
}
/* Put the array elements into a local array variable. */
enum glsl_base_type base_type =
intr->def.bit_size == 16 ? GLSL_TYPE_FLOAT16 : GLSL_TYPE_FLOAT;
const glsl_type *type =
glsl_array_type(glsl_vector_type(base_type, num_comp),
sem.num_slots, 0);
if (!array)
               array = &temp; /* loaded in place instead of at the beginning */
*array = nir_local_variable_create(b->impl, type, "");
/* Fill the array with the loaded elements. */
for (unsigned i = 0; i < sem.num_slots; i++) {
nir_store_array_var_imm(b, *array, i, elems[i],
BITFIELD_MASK(num_comp));
}
}
b->cursor = nir_before_instr(&intr->instr);
/* Get the indirect value and upsize the vector to the original size. */
nir_def *value = nir_load_array_var(b, *array, array_index.def);
value = nir_shift_channels(b, value,
(int)first_comp - (int)intr_component,
intr->def.num_components);
nir_def_replace(&intr->def, value);
progress = true;
}
}
return progress;
}
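
/* Lower one function: first gather which components of each input are read
 * anywhere, then rewrite the indirect loads.
 */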
static bool
lower_indirect_loads(nir_function_impl *impl, nir_variable_mode modes)
{
lower_io_indir_loads_state *state = calloc(1, sizeof(*state));
state->modes = modes;
if (modes & nir_var_shader_in) {
nir_function_intrinsics_pass(impl, gather_indirect_inputs,
nir_metadata_all, state);
}
bool progress = nir_function_intrinsics_pass(impl, lower_load,
nir_metadata_control_flow, state);
free(state);
return progress;
}
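
/* Usage sketch (hypothetical driver code, assuming the standard NIR_PASS
 * macro):
 *
 *    NIR_PASS(progress, nir, nir_lower_io_indirect_loads, nir_var_shader_in);
 *
 * or, for TCS, inputs and outputs together:
 *
 *    NIR_PASS(progress, nir, nir_lower_io_indirect_loads,
 *             nir_var_shader_in | nir_var_shader_out);
 */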
bool
nir_lower_io_indirect_loads(nir_shader *nir, nir_variable_mode modes)
{
assert(modes & (nir_var_shader_in | nir_var_shader_out));
assert(!(modes & ~(nir_var_shader_in | nir_var_shader_out)));
assert((!(modes & nir_var_shader_out) ||
nir->info.stage == MESA_SHADER_TESS_CTRL) &&
"output lowering only supported for TCS");
assert(nir->info.stage <= MESA_SHADER_FRAGMENT);
bool progress = false;
nir_foreach_function_impl(impl, nir) {
progress |= lower_indirect_loads(impl, modes);
}
return progress;
}