/*
* Copyright © 2010 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_eu.h"
#include "brw_shader.h"
#include "brw_builder.h"
#include "dev/intel_debug.h"
void
brw_optimize(brw_shader &s)
{
const nir_shader *nir = s.nir;
s.debug_optimizer(nir, "start", 0, 0);
/* Start by validating the shader we currently have. */
brw_validate(s);
bool progress = false;
int iteration = 0;
int pass_num = 0;
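/* Run a single pass. If it reports progress, dump the shader for the
 * optimizer debug output. The result is always re-validated, the overall
 * progress flag is updated, and the macro evaluates to whether this
 * particular pass made progress.
 */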
#define OPT(pass, ...) ({ \
pass_num++; \
bool this_progress = pass(s, ##__VA_ARGS__); \
\
if (this_progress) \
s.debug_optimizer(nir, #pass, iteration, pass_num); \
\
brw_validate(s); \
\
progress = progress || this_progress; \
this_progress; \
})
if (s.compiler->lower_dpas)
OPT(brw_lower_dpas);
OPT(brw_opt_split_virtual_grfs);
/* Before anything else, eliminate dead code. The results of some NIR
* instructions may effectively be calculated twice. Once when the
* instruction is encountered, and again when the user of that result is
* encountered. Wipe those away before algebraic optimizations and
* especially copy propagation can mix things up.
*/
OPT(brw_opt_dead_code_eliminate);
OPT(brw_opt_remove_extra_rounding_modes);
OPT(brw_opt_eliminate_find_live_channel);
/* Add load_reg instructions before the main optimization loop to get more
* defs available in those passes. Do it after the preceding few pre-loop
* passes so that it hopefully has less work to do. Having it here versus
* before the call to opt_dce made some difference, but it was mostly
* noise.
*/
OPT(brw_insert_load_reg);
/* Track how many registers are still non-SSA at this point. */
{
const brw_def_analysis &defs = s.def_analysis.require();
s.shader_stats.non_ssa_registers_after_nir =
defs.count() - defs.ssa_count();
}
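/* Main optimization loop: repeat these passes until none of them makes
 * any further progress.
 */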
do {
progress = false;
pass_num = 0;
iteration++;
OPT(brw_opt_algebraic);
OPT(brw_opt_cse_defs);
OPT(brw_opt_copy_propagation_defs);
OPT(brw_opt_cmod_propagation);
OPT(brw_opt_dead_code_eliminate);
OPT(brw_opt_saturate_propagation);
OPT(brw_opt_register_coalesce);
OPT(brw_opt_compact_virtual_grfs);
} while (progress);
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_OPT_LOOP);
progress = false;
pass_num = 0;
if (OPT(brw_opt_combine_convergent_txf))
OPT(brw_opt_copy_propagation_defs);
if (OPT(brw_lower_load_reg)) {
OPT(brw_opt_copy_propagation);
OPT(brw_opt_register_coalesce);
OPT(brw_opt_dead_code_eliminate);
}
if (OPT(brw_lower_pack)) {
OPT(brw_opt_register_coalesce);
OPT(brw_opt_dead_code_eliminate);
}
OPT(brw_lower_subgroup_ops);
OPT(brw_lower_csel);
OPT(brw_lower_simd_width);
OPT(brw_lower_scalar_fp64_MAD);
OPT(brw_lower_barycentrics);
OPT(brw_lower_logical_sends);
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);
/* After logical SEND lowering. */
if (!OPT(brw_opt_copy_propagation_defs))
OPT(brw_opt_copy_propagation);
/* Identify trailing zeros in the LOAD_PAYLOAD of sampler messages.
* Do this before splitting SENDs.
*/
if (OPT(brw_opt_zero_samples)) {
if (!OPT(brw_opt_copy_propagation_defs)) {
OPT(brw_opt_copy_propagation);
}
}
if (s.devinfo->ver >= 30)
OPT(brw_opt_send_to_send_gather);
OPT(brw_opt_split_sends);
OPT(brw_workaround_nomask_control_flow);
if (progress) {
/* Do both forms of copy propagation because it is important to
* eliminate as many cases of load_payload-of-load_payload as possible.
*/
OPT(brw_opt_copy_propagation_defs);
OPT(brw_opt_copy_propagation);
/* Run after logical send lowering to give it a chance to CSE the
* LOAD_PAYLOAD instructions created to construct the payloads of
* e.g. texturing messages in cases where it wasn't possible to CSE the
* whole logical instruction.
*/
OPT(brw_opt_cse_defs);
OPT(brw_opt_register_coalesce);
OPT(brw_opt_dead_code_eliminate);
}
OPT(brw_opt_remove_redundant_halts);
if (OPT(brw_lower_load_payload)) {
OPT(brw_opt_split_virtual_grfs);
OPT(brw_opt_register_coalesce);
OPT(brw_lower_simd_width);
OPT(brw_opt_dead_code_eliminate);
}
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);
OPT(brw_opt_combine_constants);
if (OPT(brw_lower_integer_multiplication)) {
/* If lower_integer_multiplication made progress, it may have produced
* some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
* one more time to clean those up if they exist.
*/
OPT(brw_lower_integer_multiplication);
}
OPT(brw_lower_sub_sat);
progress = false;
OPT(brw_lower_derivatives);
OPT(brw_lower_regioning);
/* Try both copy propagation passes. The defs one will likely not be
* able to handle everything at this point.
*/
const bool cp1 = OPT(brw_opt_copy_propagation_defs);
const bool cp2 = OPT(brw_opt_copy_propagation);
if (cp1 || cp2)
OPT(brw_opt_combine_constants);
OPT(brw_opt_dead_code_eliminate);
OPT(brw_opt_register_coalesce);
if (progress)
OPT(brw_lower_simd_width);
if (s.devinfo->ver >= 30)
OPT(brw_opt_send_gather_to_send);
OPT(brw_lower_uniform_pull_constant_loads);
/* Do this before brw_lower_send_descriptors. */
OPT(brw_workaround_memory_fence_before_eot);
if (OPT(brw_lower_send_descriptors)) {
/* No need for standard copy_propagation since
* brw_opt_address_reg_load will only optimize defs.
*/
OPT(brw_opt_copy_propagation_defs);
OPT(brw_opt_algebraic);
OPT(brw_opt_address_reg_load);
OPT(brw_opt_dead_code_eliminate);
}
OPT(brw_lower_sends_overlapping_payload);
OPT(brw_lower_indirect_mov);
OPT(brw_lower_alu_restrictions);
OPT(brw_lower_find_live_channel);
OPT(brw_lower_load_subgroup_invocation);
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_LATE_LOWERING);
}
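/**
 * Return how many LOAD_PAYLOAD sources (header sources included) are covered
 * by reading size_read bytes of the payload. The read size must land exactly
 * on a source boundary.
 */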
static unsigned
load_payload_sources_read_for_size(brw_inst *lp, unsigned size_read)
{
assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
assert(size_read >= lp->header_size * REG_SIZE);
unsigned i;
unsigned size = lp->header_size * REG_SIZE;
for (i = lp->header_size; size < size_read && i < lp->sources; i++)
size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);
/* The size read must exactly cover a whole number of sources. */
assert(size == size_read);
return i;
}
/**
* Optimize sample messages that have constant zero values for the trailing
* parameters. We can just reduce the message length for these
* instructions instead of reserving registers for the zero values. Trailing
* parameters that aren't sent default to zero anyway. This will cause the
* dead code eliminator to remove the MOV instructions that would otherwise
* be emitted to set up the zero values.
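*
* A sketch of the idea (illustrative pseudo-IR, not exact syntax), assuming
* the last two payload parameters are known zeros:
*
*    load_payload  vgrf4, <header>, u, v, 0, 0
*    send          dst, vgrf4                  mlen(6)
*
* becomes
*
*    load_payload  vgrf4, <header>, u, v, 0, 0
*    send          dst, vgrf4                  mlen(4)
*
* leaving the MOVs that would set up the trailing zeros dead for DCE to
* remove.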
*/
bool
brw_opt_zero_samples(brw_shader &s)
{
bool progress = false;
foreach_block_and_inst(block, brw_inst, send, s.cfg) {
if (send->opcode != SHADER_OPCODE_SEND ||
send->sfid != BRW_SFID_SAMPLER)
continue;
/* Wa_14012688258:
*
* Don't trim zeros at the end of payload for sample operations
* in cube and cube arrays.
*/
if (send->keep_payload_trailing_zeros)
continue;
/* This pass works on SENDs before splitting. */
if (send->ex_mlen > 0)
continue;
brw_inst *lp = (brw_inst *) send->prev;
if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
continue;
/* How much of the payload is actually read by this SEND. */
const unsigned params =
load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
/* We don't want to remove the message header or the first parameter.
* Removing the first parameter is not allowed, see the Haswell PRM
* volume 7, page 149:
*
* "Parameter 0 is required except for the sampleinfo message, which
* has no parameter 0"
*/
const unsigned first_param_idx = lp->header_size;
unsigned zero_size = 0;
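/* Walk the read sources backwards, accumulating the size of the trailing
 * sources that are unset or known to be zero.
 */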
for (unsigned i = params - 1; i > first_param_idx; i--) {
if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
break;
zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
}
/* Round down so that only full registers are considered. */
const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
if (zero_len > 0) {
/* Note mlen is in REG_SIZE units. */
send->mlen -= zero_len;
progress = true;
}
}
if (progress)
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL);
return progress;
}
/**
* Opportunistically split SEND message payloads.
*
* Gfx9+ supports "split" SEND messages, which take two payloads that are
* implicitly concatenated. If we find a SEND message with a single payload,
* we can split that payload in two. This results in smaller contiguous
* register blocks for us to allocate. But it can help beyond that, too.
*
* We try to split a LOAD_PAYLOAD between sources which change registers.
* For example, a sampler message often contains an x/y/z coordinate that may
* already be in a contiguous VGRF, combined with an LOD, shadow comparator,
* or array index, which comes from elsewhere. In this case, the first few
* sources will be different offsets of the same VGRF, then a later source
* will be a different VGRF. So we split there, possibly eliminating the
* payload concatenation altogether.
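*
* A sketch (illustrative pseudo-IR, not exact syntax): a payload built from a
* contiguous coordinate VGRF plus an LOD living in another VGRF,
*
*    load_payload  vgrf8, vgrf2+0, vgrf2+1, vgrf2+2, vgrf5
*    send          dst, vgrf8                       mlen(4)
*
* is split at the VGRF boundary into
*
*    load_payload  vgrf9,  vgrf2+0, vgrf2+1, vgrf2+2
*    load_payload  vgrf10, vgrf5
*    send          dst, vgrf9, vgrf10               mlen(3) ex_mlen(1)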
*/
bool
brw_opt_split_sends(brw_shader &s)
{
bool progress = false;
foreach_block_and_inst(block, brw_inst, send, s.cfg) {
if (send->opcode != SHADER_OPCODE_SEND ||
send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
send->src[2].file != VGRF)
continue;
/* Currently we don't split SENDs that reuse a previously used payload. */
brw_inst *lp = (brw_inst *) send->prev;
if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
continue;
if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
continue;
/* Split either after the header (if present), or when consecutive
* sources switch from one VGRF to a different one.
*/
unsigned mid = lp->header_size;
if (mid == 0) {
for (mid = 1; mid < lp->sources; mid++) {
if (lp->src[mid].file == BAD_FILE)
continue;
if (lp->src[0].file != lp->src[mid].file ||
lp->src[0].nr != lp->src[mid].nr)
break;
}
}
/* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
* find out how many sources from the payload it really needs.
*/
const unsigned end =
load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
/* Nothing to split. */
if (end <= mid)
continue;
const brw_builder ibld(lp);
brw_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
brw_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);
assert(lp1->size_written % REG_SIZE == 0);
assert(lp2->size_written % REG_SIZE == 0);
assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);
lp1->dst = retype(brw_allocate_vgrf_units(s, lp1->size_written / REG_SIZE), lp1->dst.type);
lp2->dst = retype(brw_allocate_vgrf_units(s, lp2->size_written / REG_SIZE), lp2->dst.type);
send->resize_sources(4);
send->src[2] = lp1->dst;
send->src[3] = lp2->dst;
send->ex_mlen = lp2->size_written / REG_SIZE;
send->mlen -= send->ex_mlen;
progress = true;
}
if (progress)
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
BRW_DEPENDENCY_VARIABLES);
return progress;
}
/**
* Remove redundant or useless halts.
*
* For example, we can eliminate halts in the following sequence:
*
* halt (redundant with the next halt)
* halt (useless; jumps to the next instruction)
* halt-target
*/
bool
brw_opt_remove_redundant_halts(brw_shader &s)
{
bool progress = false;
unsigned halt_count = 0;
brw_inst *halt_target = NULL;
foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
if (inst->opcode == BRW_OPCODE_HALT)
halt_count++;
if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
halt_target = inst;
break;
}
}
if (!halt_target) {
assert(halt_count == 0);
return false;
}
/* Delete any HALTs immediately before the halt target. */
for (brw_inst *prev = (brw_inst *) halt_target->prev;
!prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
prev = (brw_inst *) halt_target->prev) {
prev->remove();
halt_count--;
progress = true;
}
if (halt_count == 0) {
halt_target->remove();
progress = true;
}
if (progress)
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
return progress;
}
/**
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
* flow. We could probably do better here with some form of divergence
* analysis.
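*
* A sketch (illustrative pseudo-IR, not exact syntax): with packed dispatch,
* channel 0 is known to be live outside control flow, so
*
*    find_live_channel  chan
*    broadcast          dst, value, chan
*
* can be rewritten as
*
*    mov  chan, 0u
*    mov  dst, value.0   (component 0 of the broadcast source)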
*/
bool
brw_opt_eliminate_find_live_channel(brw_shader &s)
{
bool progress = false;
unsigned depth = 0;
if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
s.prog_data)) {
/* The optimization below assumes that channel zero is live on thread
* dispatch, which may not be the case if the fixed function dispatches
* threads sparsely.
*/
return false;
}
foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
switch (inst->opcode) {
case BRW_OPCODE_IF:
case BRW_OPCODE_DO:
depth++;
break;
case BRW_OPCODE_ENDIF:
case BRW_OPCODE_WHILE:
depth--;
break;
case BRW_OPCODE_HALT:
/* This can potentially make control flow non-uniform until the end
* of the program.
*/
goto out;
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
if (depth == 0) {
inst->opcode = BRW_OPCODE_MOV;
inst->src[0] = brw_imm_ud(0u);
inst->force_writemask_all = true;
/* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
* size_written set by hand to a smaller value. In this case,
* munge the exec_size to match.
*/
if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
inst->exec_size = 8 * reg_unit(s.devinfo);
inst->resize_sources(1);
progress = true;
/* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
* with a BROADCAST. Save some work for opt_copy_propagation
* and opt_algebraic by trivially cleaning up both together.
*/
assert(!inst->next->is_tail_sentinel());
brw_inst *bcast = (brw_inst *) inst->next;
/* Ignore stride when comparing */
if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
inst->dst.file == VGRF &&
inst->dst.file == bcast->src[1].file &&
inst->dst.nr == bcast->src[1].nr &&
inst->dst.offset == bcast->src[1].offset) {
bcast->opcode = BRW_OPCODE_MOV;
if (!is_uniform(bcast->src[0]))
bcast->src[0] = component(bcast->src[0], 0);
bcast->force_writemask_all = true;
bcast->exec_size = 8 * reg_unit(s.devinfo);
assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
bcast->resize_sources(1);
}
}
break;
default:
break;
}
}
out:
if (progress)
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW |
BRW_DEPENDENCY_INSTRUCTION_DETAIL);
return progress;
}
/**
* The rounding mode is emitted alongside each conversion instruction, but in
* the hardware it is persistent state: once a mode has been set, subsequent
* conversions that use the same mode don't need to set it again.
*
* This is useful for vector/matrix conversions, as setting the
* mode once is enough for the full vector/matrix.
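*
* For example (illustrative pseudo-IR, not exact syntax), a two-component
* f32->f16 conversion may initially emit
*
*    rnd_mode  rtne
*    mov       f16, f32
*    rnd_mode  rtne
*    mov       f16, f32
*
* and the second RND_MODE can be removed because the mode is already set.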
*/
bool
brw_opt_remove_extra_rounding_modes(brw_shader &s)
{
bool progress = false;
unsigned execution_mode = s.nir->info.float_controls_execution_mode;
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
execution_mode)
base_mode = BRW_RND_MODE_RTNE;
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
execution_mode)
base_mode = BRW_RND_MODE_RTZ;
foreach_block (block, s.cfg) {
brw_rnd_mode prev_mode = base_mode;
foreach_inst_in_block_safe (brw_inst, inst, block) {
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
assert(inst->src[0].file == IMM);
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
if (mode == prev_mode) {
inst->remove();
progress = true;
} else {
prev_mode = mode;
}
}
}
}
if (progress)
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
return progress;
}
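/* Convert split SEND messages with larger payloads into SEND_GATHER, which
 * takes its payload as a list of individual physical-register sources
 * instead of one or two contiguous spans. The gather list itself (source 2,
 * communicated through the ARF scalar register) is filled in after register
 * allocation.
 */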
bool
brw_opt_send_to_send_gather(brw_shader &s)
{
const intel_device_info *devinfo = s.devinfo;
bool progress = false;
assert(devinfo->ver >= 30);
const unsigned unit = reg_unit(devinfo);
assert(unit == 2);
unsigned count = 0;
foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
if (inst->opcode != SHADER_OPCODE_SEND)
continue;
/* For 1-2 registers, send-gather offers no benefits over split-send. */
if (inst->mlen + inst->ex_mlen <= 2 * unit)
continue;
assert(inst->mlen % unit == 0);
assert(inst->ex_mlen % unit == 0);
struct {
brw_reg src;
unsigned phys_len;
} payload[2] = {
{ inst->src[2], inst->mlen / unit },
{ inst->src[3], inst->ex_mlen / unit },
};
const unsigned num_payload_sources = payload[0].phys_len + payload[1].phys_len;
/* Limited by Src0.Length in the SEND instruction. */
if (num_payload_sources > 15)
continue;
if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
count++;
continue;
}
inst->resize_sources(3 + num_payload_sources);
/* Sources 0 and 1 remain the same. Source 2 will be filled
* after register allocation.
*/
inst->src[2] = {};
int idx = 3;
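/* List every physical register of both payloads as an individual source. */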
for (unsigned p = 0; p < ARRAY_SIZE(payload); p++) {
for (unsigned i = 0; i < payload[p].phys_len; i++) {
inst->src[idx++] = byte_offset(payload[p].src,
i * reg_unit(devinfo) * REG_SIZE);
}
}
assert(idx == inst->sources);
inst->opcode = SHADER_OPCODE_SEND_GATHER;
inst->mlen = 0;
inst->ex_mlen = 0;
progress = true;
}
if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
fprintf(stderr, "Ignored %u opportunities to try SEND_GATHER in %s shader.\n",
count, _mesa_shader_stage_to_string(s.stage));
}
if (progress)
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW);
return progress;
}
/* If, after optimizations, the sources of a SEND_GATHER are *still*
* contiguous, prefer the regular SEND, which saves having to write the
* ARF scalar register.
*/
bool
brw_opt_send_gather_to_send(brw_shader &s)
{
const intel_device_info *devinfo = s.devinfo;
bool progress = false;
assert(devinfo->ver >= 30);
const unsigned unit = reg_unit(devinfo);
assert(unit == 2);
foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
continue;
assert(inst->sources > 2);
assert(inst->src[2].file == BAD_FILE);
const int num_payload_sources = inst->sources - 3;
assert(num_payload_sources > 0);
/* Limited by Src0.Length in the SEND instruction. */
assert(num_payload_sources < 16);
/* Determine whether the sources still form one or two contiguous spans.
* In those cases the regular SEND instruction can be used and there's no
* need for SEND_GATHER (which would set the ARF scalar register, adding an
* extra instruction).
*/
const brw_reg *payload = &inst->src[3];
brw_reg payload1 = payload[0];
brw_reg payload2 = {};
int payload1_len = 0;
int payload2_len = 0;
for (int i = 0; i < num_payload_sources; i++) {
if (payload[i].file == VGRF &&
payload[i].nr == payload1.nr &&
payload[i].offset == payload1_len * REG_SIZE * unit)
payload1_len++;
else {
payload2 = payload[i];
break;
}
}
if (payload2.file == VGRF) {
for (int i = payload1_len; i < num_payload_sources; i++) {
if (payload[i].file == VGRF &&
payload[i].nr == payload2.nr &&
payload[i].offset == payload2_len * REG_SIZE * unit)
payload2_len++;
else
break;
}
} else {
payload2 = brw_null_reg();
}
if (payload1_len + payload2_len != num_payload_sources)
continue;
/* Bspec 57058 (r64705) says
*
* When a source data payload is used in dataport message, that payload
* must be specified as Source 1 portion of a Split Send message.
*
* But at this point the split point is not guaranteed to respect that.
*
* TODO: Pass LSC address length or infer it so valid splits can work.
*/
if (payload2_len && (inst->sfid == BRW_SFID_UGM ||
inst->sfid == BRW_SFID_TGM ||
inst->sfid == BRW_SFID_SLM ||
inst->sfid == BRW_SFID_URB)) {
enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
if (lsc_op_num_data_values(lsc_op) > 0)
continue;
}
inst->resize_sources(4);
inst->opcode = SHADER_OPCODE_SEND;
inst->src[2] = payload1;
inst->src[3] = payload2;
inst->mlen = payload1_len * unit;
inst->ex_mlen = payload2_len * unit;
progress = true;
}
if (progress) {
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW);
}
return progress;
}