blob: 3493c7b53dbffbad5269eecd3e7c48016b553569 [file] [log] [blame]
/*
* Copyright 2024 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
/**
* This pass:
* - vectorizes lowered input/output loads and stores
* - vectorizes low and high 16-bit loads and stores by merging them into
* a single 32-bit load or store (except load_interpolated_input and XFB,
* which have to keep bit_size=16)
* - performs DCE of output stores that overwrite the previous value by writing
* into the same slot and component.
*
* Vectorization is only local within basic blocks. No vectorization occurs
* across basic block boundaries, barriers (only TCS outputs), emits (only
* GS outputs), and output load <-> output store dependencies.
*
* All loads and stores must be scalar. 64-bit loads and stores are forbidden.
*
* For each basic block, the time complexity is O(n*log(n)) where n is
* the number of IO instructions within that block.
*/
#include "util/u_dynarray.h"
#include "nir.h"
#include "nir_builder.h"
/* Return 0 when two IO loads/stores may be vectorized together. Otherwise
 * return 1 or -1 to impose a total ordering between non-vectorizable
 * instructions; qsort uses this to cluster vectorizable instructions into
 * adjacent groups.
 */
static int
compare_is_not_vectorizable(nir_intrinsic_instr *a, nir_intrinsic_instr *b)
{
   if (a->intrinsic != b->intrinsic)
      return a->intrinsic > b->intrinsic ? 1 : -1;

   /* The (indirect) offset sources must be identical SSA values. */
   nir_src *off_a = nir_get_io_offset_src(a);
   nir_src *off_b = nir_get_io_offset_src(b);
   if (off_a && off_a->ssa != off_b->ssa)
      return off_a->ssa->index > off_b->ssa->index ? 1 : -1;

   /* The arrayed (per-vertex/per-primitive) index sources must match too. */
   nir_src *arr_a = nir_get_io_arrayed_index_src(a);
   nir_src *arr_b = nir_get_io_arrayed_index_src(b);
   if (arr_a && arr_a->ssa != arr_b->ssa)
      return arr_a->ssa->index > arr_b->ssa->index ? 1 : -1;

   /* Compare barycentrics or vertex index. */
   bool compares_src0 = a->intrinsic == nir_intrinsic_load_interpolated_input ||
                        a->intrinsic == nir_intrinsic_load_input_vertex;
   if (compares_src0 && a->src[0].ssa != b->src[0].ssa)
      return a->src[0].ssa->index > b->src[0].ssa->index ? 1 : -1;

   nir_io_semantics sem_a = nir_intrinsic_io_semantics(a);
   nir_io_semantics sem_b = nir_intrinsic_io_semantics(b);

   if (sem_a.location != sem_b.location)
      return sem_a.location > sem_b.location ? 1 : -1;

   /* The mediump flag isn't mergable. */
   if (sem_a.medium_precision != sem_b.medium_precision)
      return sem_a.medium_precision > sem_b.medium_precision ? 1 : -1;

   /* Don't merge per-view attributes with non-per-view attributes. */
   if (sem_a.per_view != sem_b.per_view)
      return sem_a.per_view > sem_b.per_view ? 1 : -1;

   if (sem_a.interp_explicit_strict != sem_b.interp_explicit_strict)
      return sem_a.interp_explicit_strict > sem_b.interp_explicit_strict ? 1 : -1;

   /* Only load_interpolated_input can't merge low and high halves of 16-bit
    * loads/stores.
    */
   if (a->intrinsic == nir_intrinsic_load_interpolated_input &&
       sem_a.high_16bits != sem_b.high_16bits)
      return sem_a.high_16bits > sem_b.high_16bits ? 1 : -1;

   /* TODO: vectorize (f32, f32, f16vec2, f16vec2) -> vec4
    * For now, different bit sizes are not vectorized together.
    */
   unsigned bits_a, bits_b;
   if (nir_intrinsic_has_src_type(a)) {
      /* Stores: compare the stored value's bit size. */
      bits_a = a->src[0].ssa->bit_size;
      bits_b = b->src[0].ssa->bit_size;
   } else {
      /* Loads: compare the destination's bit size. */
      bits_a = a->def.bit_size;
      bits_b = b->def.bit_size;
   }
   if (bits_a != bits_b)
      return bits_a > bits_b ? 1 : -1;

   nir_shader *shader =
      nir_cf_node_get_function(&a->instr.block->cf_node)->function->shader;

   /* Compare the types unless the driver says it ignores them. */
   if (!(shader->options->io_options & nir_io_vectorizer_ignores_types)) {
      unsigned type_a, type_b;

      if (nir_intrinsic_has_src_type(a)) {
         type_a = nir_intrinsic_src_type(a);
         type_b = nir_intrinsic_src_type(b);
      } else {
         type_a = nir_intrinsic_dest_type(a);
         type_b = nir_intrinsic_dest_type(b);
      }

      if (type_a != type_b)
         return type_a > type_b ? 1 : -1;
   }

   return 0;
}
/* qsort comparator: groups vectorizable instructions together and falls back
 * to original program order within a group.
 */
static int
compare_intr(const void *pa, const void *pb)
{
   nir_intrinsic_instr *a = *(nir_intrinsic_instr **)pa;
   nir_intrinsic_instr *b = *(nir_intrinsic_instr **)pb;

   int order = compare_is_not_vectorizable(a, b);
   if (order != 0)
      return order;

   /* qsort isn't stable. Comparing instr.index keeps the original order,
    * so later stores are never moved before earlier stores.
    */
   return a->instr.index > b->instr.index ? 1 : -1;
}
/* The order of vectorization steps applied to one slot's channels. */
typedef enum {
   /* Merge matching low/high 16-bit channel pairs into single 32-bit ops. */
   merge_low_high_16_to_32,
   /* Vectorize the remaining high 16-bit channels among themselves. */
   vectorize_high_16_separately,
   /* Vectorize everything that's left (low/full channels). */
   vectorize_the_rest,
} nir_vectorize_op_step;
/* Whether the RADV intrinsic-component workaround is enabled for this
 * shader (keeps the raw 0..7 channel index in the component field).
 */
static bool
apply_radv_workaround(nir_builder *b)
{
   const struct nir_shader_compiler_options *opts = b->shader->options;

   return (opts->io_options & nir_io_radv_intrinsic_component_workaround) != 0;
}
/* Replace the scalar loads chan[start..start+count-1] (plus the matching
 * high 16-bit loads chan[start+4..] in the merge step) with one vectorized
 * load inserted at the earliest of them, then rewrite all uses.
 */
static void
vectorize_load(nir_intrinsic_instr *chan[8], unsigned start, unsigned count,
               nir_vectorize_op_step step)
{
   nir_intrinsic_instr *first = NULL;

   /* Find the first instruction where the vectorized load will be
    * inserted.
    */
   for (unsigned i = start; i < start + count; i++) {
      /* Loads may have holes (allow_holes); skip missing channels. */
      if (!chan[i])
         continue;

      first = !first || chan[i]->instr.index < first->instr.index ? chan[i] : first;

      if (step == merge_low_high_16_to_32) {
         /* The matching high half may appear earlier in the block. */
         first = !first || chan[4 + i]->instr.index < first->instr.index ? chan[4 + i] : first;
      }
   }

   /* Insert the vectorized load. */
   nir_builder b = nir_builder_at(nir_before_instr(&first->instr));
   nir_intrinsic_instr *new_intr =
      nir_intrinsic_instr_create(b.shader, first->intrinsic);

   new_intr->num_components = count;
   /* Merged 16-bit halves become a 32-bit load; otherwise keep the size. */
   nir_def_init(&new_intr->instr, &new_intr->def, count,
                step == merge_low_high_16_to_32 ? 32 : first->def.bit_size);
   /* Reuse all sources of "first" (offset, vertex index, barycentrics...);
    * the builder insert below registers the uses.
    */
   memcpy(new_intr->src, first->src,
          nir_intrinsic_infos[first->intrinsic].num_srcs * sizeof(nir_src));
   nir_intrinsic_copy_const_indices(new_intr, first);

   if (apply_radv_workaround(&b))
      nir_intrinsic_set_component(new_intr, start);
   else
      nir_intrinsic_set_component(new_intr, start & 0x3); /* Bits 4..7 should map to 0..3 */
   assert(start % 4 + count <= 4);

   nir_io_semantics sem = nir_intrinsic_io_semantics(new_intr);

   if (step == vectorize_high_16_separately) {
      assert(start >= 4);
      sem.high_16bits = 1;
   } else {
      assert(start <= 3);
   }

   if (step == merge_low_high_16_to_32) {
      sem.high_16bits = 0;
      /* Switch the dest type's bit-size encoding from 16 to 32
       * (nir_alu_type keeps the size in the low bits).
       */
      nir_intrinsic_set_dest_type(new_intr,
                                  (nir_intrinsic_dest_type(new_intr) & ~16) | 32);
   }

   nir_intrinsic_set_io_semantics(new_intr, sem);
   nir_builder_instr_insert(&b, &new_intr->instr);
   nir_def *def = &new_intr->def;

   /* Replace the scalar loads. */
   if (step == merge_low_high_16_to_32) {
      /* Both halves are guaranteed present here (see vectorize_slot). */
      for (unsigned i = start; i < start + count; i++) {
         nir_def *comp = nir_channel(&b, def, i - start);

         /* Low half = low 16 bits, high half = high 16 bits of each dword. */
         nir_def_rewrite_uses(&chan[i]->def,
                              nir_unpack_32_2x16_split_x(&b, comp));
         nir_def_rewrite_uses(&chan[4 + i]->def,
                              nir_unpack_32_2x16_split_y(&b, comp));
         nir_instr_remove(&chan[i]->instr);
         nir_instr_remove(&chan[4 + i]->instr);
      }
   } else {
      for (unsigned i = start; i < start + count; i++) {
         if (chan[i])
            nir_def_replace(&chan[i]->def, nir_channel(&b, def, i - start));
      }
   }
}
/* Replace the scalar stores chan[start..start+count-1] (plus the matching
 * high 16-bit stores chan[start+4..] in the merge step) with one vectorized
 * store. The last store in program order is rewritten in place, which
 * guarantees every stored value is defined at the insertion point.
 */
static void
vectorize_store(nir_intrinsic_instr *chan[8], unsigned start, unsigned count,
                nir_vectorize_op_step step)
{
   nir_intrinsic_instr *last = NULL;

   /* Find the last instruction where the vectorized store will be
    * inserted. (Stores never have holes, so all scanned channels exist.)
    */
   for (unsigned i = start; i < start + count; i++) {
      last = !last || chan[i]->instr.index > last->instr.index ? chan[i] : last;

      if (step == merge_low_high_16_to_32) {
         last = !last || chan[4 + i]->instr.index > last->instr.index ? chan[4 + i] : last;
      }
   }

   /* Change the last instruction to a vectorized store. Update xfb first
    * because we need to read some info from "last" before overwriting it.
    */
   if (nir_intrinsic_has_io_xfb(last)) {
      /* 0 = low/full XY channels
       * 1 = low/full ZW channels
       * 2 = high XY channels
       * 3 = high ZW channels
       */
      nir_io_xfb xfb[4] = { { { { 0 } } } };

      /* Gather the per-channel xfb info from the scalar stores. */
      for (unsigned i = start; i < start + count; i++) {
         xfb[i / 2].out[i % 2] =
            ((i % 4) < 2 ? nir_intrinsic_io_xfb(chan[i]) : nir_intrinsic_io_xfb2(chan[i])).out[i % 2];

         /* Merging low and high 16 bits to 32 bits is not possible
          * with xfb in some cases. vectorize_slot filters those out.
          */
         assert(!xfb[i / 2].out[i % 2].num_components ||
                step != merge_low_high_16_to_32);
      }

      /* Now vectorize xfb info by merging the individual elements. */
      for (unsigned i = start; i < start + count; i++) {
         /* mediump means that xfb upconverts to 32 bits when writing to
          * memory.
          */
         unsigned xfb_comp_size =
            nir_intrinsic_io_semantics(chan[i]).medium_precision ? 32 : chan[i]->src[0].ssa->bit_size;

         for (unsigned j = i + 1; j < start + count; j++) {
            /* Component j can only be folded into component i's xfb output
             * when it targets the same buffer at i's offset advanced by
             * (j - i) components. Note that later components must be at
             * *larger* offsets, i.e. offset_j == offset_i + size * (j - i).
             */
            if (xfb[i / 2].out[i % 2].buffer != xfb[j / 2].out[j % 2].buffer ||
                xfb[i / 2].out[i % 2].offset + xfb_comp_size * (j - i) !=
                   xfb[j / 2].out[j % 2].offset)
               break;

            xfb[i / 2].out[i % 2].num_components++;
            memset(&xfb[j / 2].out[j % 2], 0, sizeof(xfb[j / 2].out[j % 2]));
         }
      }

      if (start >= 4) {
         /* High 16-bit channels use the xfb slots for components 4..7. */
         nir_intrinsic_set_io_xfb(last, xfb[2]);
         nir_intrinsic_set_io_xfb2(last, xfb[3]);
      } else {
         assert(start + count <= 4);
         nir_intrinsic_set_io_xfb(last, xfb[0]);
         nir_intrinsic_set_io_xfb2(last, xfb[1]);
      }
   }

   /* Update gs_streams: 2 bits per vectorized component. */
   unsigned gs_streams = 0;
   for (unsigned i = start; i < start + count; i++) {
      gs_streams |= (nir_intrinsic_io_semantics(chan[i]).gs_streams & 0x3) << ((i - start) * 2);
   }

   nir_io_semantics sem = nir_intrinsic_io_semantics(last);
   sem.gs_streams = gs_streams;

   if (step == vectorize_high_16_separately) {
      assert(start >= 4);
      sem.high_16bits = 1;
   } else {
      assert(start <= 3);
   }

   /* Update other flags: the "no" flags may only stay set if every merged
    * channel had them set.
    */
   for (unsigned i = start; i < start + count; i++) {
      if (!nir_intrinsic_io_semantics(chan[i]).no_sysval_output)
         sem.no_sysval_output = 0;
      if (!nir_intrinsic_io_semantics(chan[i]).no_varying)
         sem.no_varying = 0;
   }

   if (step == merge_low_high_16_to_32) {
      /* Update "no" flags for high bits. */
      for (unsigned i = start; i < start + count; i++) {
         if (!nir_intrinsic_io_semantics(chan[4 + i]).no_sysval_output)
            sem.no_sysval_output = 0;
         if (!nir_intrinsic_io_semantics(chan[4 + i]).no_varying)
            sem.no_varying = 0;
      }

      /* Update the type: switch the bit-size encoding from 16 to 32. */
      sem.high_16bits = 0;
      nir_intrinsic_set_src_type(last,
                                 (nir_intrinsic_src_type(last) & ~16) | 32);
   }

   /* TODO: Merge names? */

   nir_builder b = nir_builder_at(nir_before_instr(&last->instr));

   /* Update the rest. */
   nir_intrinsic_set_io_semantics(last, sem);

   if (apply_radv_workaround(&b))
      nir_intrinsic_set_component(last, start);
   else
      nir_intrinsic_set_component(last, start & 0x3); /* Bits 4..7 should map to 0..3 */
   assert(start % 4 + count <= 4);

   nir_intrinsic_set_write_mask(last, BITFIELD_MASK(count));
   last->num_components = count;

   /* Replace the stored scalar with the vector. */
   if (step == merge_low_high_16_to_32) {
      /* Pack each low/high 16-bit pair into one 32-bit value. */
      nir_def *value[4];

      for (unsigned i = start; i < start + count; i++) {
         value[i] = nir_pack_32_2x16_split(&b, chan[i]->src[0].ssa,
                                           chan[4 + i]->src[0].ssa);
      }

      nir_src_rewrite(&last->src[0], nir_vec(&b, &value[start], count));
   } else {
      nir_def *value[8];

      for (unsigned i = start; i < start + count; i++)
         value[i] = chan[i]->src[0].ssa;

      nir_src_rewrite(&last->src[0], nir_vec(&b, &value[start], count));
   }

   /* Remove the scalar stores. */
   for (unsigned i = start; i < start + count; i++) {
      if (chan[i] != last)
         nir_instr_remove(&chan[i]->instr);
      if (step == merge_low_high_16_to_32 && chan[4 + i] != last)
         nir_instr_remove(&chan[4 + i]->instr);
   }
}
/* Vectorize a vector of scalar instructions. chan[8] are the channels
 * (the last 4 are the high 16-bit channels), and "mask" has one bit set
 * per present channel. Returns true if anything was vectorized.
 */
static bool
vectorize_slot(nir_intrinsic_instr *chan[8], unsigned mask, bool allow_holes)
{
   bool progress = false;

   assert(mask);
   /* All instructions in a group are either loads or stores, never mixed. */
   bool is_load = nir_intrinsic_infos[chan[ffs(mask) - 1]->intrinsic].has_dest;

   /* First, merge low and high 16-bit halves into 32 bits separately when
    * possible. Then vectorize what's left.
    */
   for (nir_vectorize_op_step step = merge_low_high_16_to_32;
        step <= vectorize_the_rest; step++) {
      unsigned scan_mask;

      if (step == merge_low_high_16_to_32) {
         /* Get the subset of the mask where both low and high bits are set. */
         scan_mask = 0;

         for (unsigned i = 0; i < 4; i++) {
            unsigned low_high_bits = BITFIELD_BIT(i) | BITFIELD_BIT(i + 4);

            if ((mask & low_high_bits) == low_high_bits) {
               /* Merging low and high 16 bits to 32 bits is not possible
                * with xfb in some cases.
                */
               if (nir_intrinsic_has_io_xfb(chan[i])) {
                  unsigned hi = i + 4;

                  if ((i < 2 ? nir_intrinsic_io_xfb(chan[i])
                             : nir_intrinsic_io_xfb2(chan[i]))
                         .out[i % 2]
                         .num_components ||
                      (i < 2 ? nir_intrinsic_io_xfb(chan[hi])
                             : nir_intrinsic_io_xfb2(chan[hi]))
                         .out[i % 2]
                         .num_components)
                     continue;
               }

               /* The GS stream must be the same for both halves. */
               if ((nir_intrinsic_io_semantics(chan[i]).gs_streams & 0x3) !=
                   (nir_intrinsic_io_semantics(chan[4 + i]).gs_streams & 0x3))
                  continue;

               scan_mask |= BITFIELD_BIT(i);
               mask &= ~low_high_bits;
            }
         }
      } else if (step == vectorize_high_16_separately) {
         scan_mask = mask & BITFIELD_RANGE(4, 4);
         mask &= ~scan_mask;

         /* Fill load holes by widening the load. Only do this when there
          * are high 16-bit channels at all: with scan_mask == 0,
          * util_last_bit returns 0 and "num - 4" would underflow, making
          * BITFIELD_RANGE shift by an out-of-range amount (UB) and
          * producing a garbage mask with NULL channels.
          */
         if (is_load && allow_holes && scan_mask) {
            unsigned num = util_last_bit(scan_mask);
            scan_mask = BITFIELD_RANGE(4, num - 4);
         }
      } else {
         scan_mask = mask;

         /* Same hole-filling as above for the low/full channels. */
         if (is_load && allow_holes && scan_mask) {
            unsigned num = util_last_bit(scan_mask);
            scan_mask = BITFIELD_MASK(num);
         }
      }

      while (scan_mask) {
         int start, count;

         u_bit_scan_consecutive_range(&scan_mask, &start, &count);

         /* A single channel can't be vectorized, but a low/high 16-bit
          * pair is still merged into one 32-bit op.
          */
         if (count == 1 && step != merge_low_high_16_to_32)
            continue; /* There is nothing to vectorize. */

         if (is_load)
            vectorize_load(chan, start, count, step);
         else
            vectorize_store(chan, start, count, step);

         progress = true;
      }
   }

   return progress;
}
static bool
vectorize_batch(struct util_dynarray *io_instructions, bool allow_holes)
{
unsigned num_instr = util_dynarray_num_elements(io_instructions, void *);
/* We need to at least 2 instructions to have something to do. */
if (num_instr <= 1) {
/* Clear the array. The next block will reuse it. */
util_dynarray_clear(io_instructions);
return false;
}
/* The instructions are sorted such that groups of vectorizable
* instructions are next to each other. Multiple incompatible
* groups of vectorizable instructions can occur in this array.
* The reason why 2 groups would be incompatible is that they
* could have a different intrinsic, indirect index, array index,
* vertex index, barycentrics, or location. Each group is vectorized
* separately.
*
* This reorders instructions in the array, but not in the shader.
*/
qsort(io_instructions->data, num_instr, sizeof(void *), compare_intr);
nir_intrinsic_instr *chan[8] = { 0 }, *prev = NULL;
unsigned chan_mask = 0;
bool progress = false;
/* Vectorize all groups.
*
* The channels for each group are gathered. If 2 stores overwrite
* the same channel, the earlier store is DCE'd here.
*/
util_dynarray_foreach(io_instructions, nir_intrinsic_instr *, intr) {
/* If the next instruction is not vectorizable, vectorize what
* we have gathered so far.
*/
if (prev && compare_is_not_vectorizable(prev, *intr)) {
/* We need at least 2 instructions to have something to do. */
if (util_bitcount(chan_mask) > 1)
progress |= vectorize_slot(chan, chan_mask, allow_holes);
prev = NULL;
memset(chan, 0, sizeof(chan));
chan_mask = 0;
}
/* This performs DCE of output stores because the previous value
* is being overwritten.
*/
unsigned index = nir_intrinsic_io_semantics(*intr).high_16bits * 4 +
nir_intrinsic_component(*intr);
bool is_store = !nir_intrinsic_infos[(*intr)->intrinsic].has_dest;
if (is_store && chan[index])
nir_instr_remove(&chan[index]->instr);
/* Gather the channel. */
chan[index] = *intr;
prev = *intr;
chan_mask |= BITFIELD_BIT(index);
}
/* Vectorize the last group. */
if (prev && util_bitcount(chan_mask) > 1)
progress |= vectorize_slot(chan, chan_mask, allow_holes);
/* Clear the array. The next block will reuse it. */
util_dynarray_clear(io_instructions);
return progress;
}
/* Vectorize lowered IO (load_input/store_output/...).
 *
 * modes specifies whether to vectorize inputs and/or outputs.
 *
 * allow_holes enables vectorization of loads with holes, e.g.:
 *    load X; load W; ==> load XYZW;
 *
 * This is useful for VS input loads where it might not be possible to skip
 * loading unused components, e.g. with AMD where loading W also loads XYZ,
 * so if we also load X separately again, it's wasteful. It's better to get
 * X from the vector that loads (XYZ)W.
 *
 * Returns true if any instruction was vectorized or removed.
 */
bool
nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes,
                     bool allow_holes)
{
   /* This pass only understands shader inputs and outputs. */
   assert(!(modes & ~(nir_var_shader_in | nir_var_shader_out)));

   /* Honor drivers that want FS inputs to stay scalar. */
   if (shader->info.stage == MESA_SHADER_FRAGMENT &&
       shader->options->io_options & nir_io_prefer_scalar_fs_inputs)
      modes &= ~nir_var_shader_in;

   if ((shader->info.stage == MESA_SHADER_TESS_CTRL ||
        shader->info.stage == MESA_SHADER_GEOMETRY) &&
       util_bitcount(modes) == 2) {
      /* When vectorizing TCS and GS IO, inputs can ignore barriers and emits,
       * but that is only done when outputs are ignored, so vectorize them
       * separately.
       */
      bool progress_in = nir_opt_vectorize_io(shader, nir_var_shader_in,
                                              allow_holes);
      bool progress_out = nir_opt_vectorize_io(shader, nir_var_shader_out,
                                               allow_holes);
      return progress_in || progress_out;
   }

   /* Initialize dynamic arrays. */
   struct util_dynarray io_instructions;
   util_dynarray_init(&io_instructions, NULL);

   bool global_progress = false;

   nir_foreach_function_impl(impl, shader) {
      bool progress = false;

      /* instr.index is needed to sort by program order and to choose
       * insertion points for vectorized loads/stores.
       */
      nir_metadata_require(impl, nir_metadata_instr_index);

      nir_foreach_block(block, impl) {
         /* One bit per output channel (slot * 8 + high_16bits * 4 +
          * component), used to detect output load <-> store dependencies
          * within the current batch.
          */
         BITSET_DECLARE(has_output_loads, NUM_TOTAL_VARYING_SLOTS * 8);
         BITSET_DECLARE(has_output_stores, NUM_TOTAL_VARYING_SLOTS * 8);
         BITSET_ZERO(has_output_loads);
         BITSET_ZERO(has_output_stores);

         /* Gather load/store intrinsics within the block. */
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
            bool is_load = nir_intrinsic_infos[intr->intrinsic].has_dest;
            bool is_output = false;
            nir_io_semantics sem = { 0 };
            unsigned index = 0;

            if (nir_intrinsic_has_io_semantics(intr)) {
               sem = nir_intrinsic_io_semantics(intr);
               assert(sem.location < NUM_TOTAL_VARYING_SLOTS);
               /* 8 channels per slot: 4 low/full + 4 high 16-bit. */
               index = sem.location * 8 + sem.high_16bits * 4 +
                       nir_intrinsic_component(intr);
            }

            switch (intr->intrinsic) {
            case nir_intrinsic_load_input:
            case nir_intrinsic_load_per_primitive_input:
            case nir_intrinsic_load_input_vertex:
            case nir_intrinsic_load_interpolated_input:
            case nir_intrinsic_load_per_vertex_input:
               if (!(modes & nir_var_shader_in))
                  continue;
               break;

            case nir_intrinsic_load_output:
            case nir_intrinsic_load_per_vertex_output:
            case nir_intrinsic_load_per_view_output:
            case nir_intrinsic_load_per_primitive_output:
            case nir_intrinsic_store_output:
            case nir_intrinsic_store_per_vertex_output:
            case nir_intrinsic_store_per_view_output:
            case nir_intrinsic_store_per_primitive_output:
               if (!(modes & nir_var_shader_out))
                  continue;

               /* Break the batch if an output load is followed by an output
                * store to the same channel and vice versa.
                */
               if (BITSET_TEST(is_load ? has_output_stores : has_output_loads,
                               index)) {
                  progress |= vectorize_batch(&io_instructions, allow_holes);
                  BITSET_ZERO(has_output_loads);
                  BITSET_ZERO(has_output_stores);
               }
               is_output = true;
               break;

            case nir_intrinsic_barrier:
               /* Don't vectorize across TCS barriers. */
               if (modes & nir_var_shader_out &&
                   nir_intrinsic_memory_modes(intr) & nir_var_shader_out) {
                  progress |= vectorize_batch(&io_instructions, allow_holes);
                  BITSET_ZERO(has_output_loads);
                  BITSET_ZERO(has_output_stores);
               }
               continue;

            case nir_intrinsic_emit_vertex:
               /* Don't vectorize across GS emits. */
               progress |= vectorize_batch(&io_instructions, allow_holes);
               BITSET_ZERO(has_output_loads);
               BITSET_ZERO(has_output_stores);
               continue;

            default:
               continue;
            }

            /* Only scalar 16 and 32-bit instructions are allowed. */
            ASSERTED nir_def *value = is_load ? &intr->def : intr->src[0].ssa;
            assert(value->num_components == 1);
            assert(value->bit_size == 16 || value->bit_size == 32);

            util_dynarray_append(&io_instructions, void *, intr);

            if (is_output)
               BITSET_SET(is_load ? has_output_loads : has_output_stores, index);
         }

         /* Vectorize whatever is left at the end of the block. */
         progress |= vectorize_batch(&io_instructions, allow_holes);
      }

      nir_progress(progress, impl,
                   nir_metadata_block_index | nir_metadata_dominance);
      global_progress |= progress;
   }

   util_dynarray_fini(&io_instructions);
   return global_progress;
}