src/intel/compiler/brw_opt_algebraic.cpp - third_party/mesa - Git at Google

 /*
  * Copyright © 2010 Intel Corporation
  * SPDX-License-Identifier: MIT
  */

 #include "brw_shader.h"
 #include "brw_builder.h"
 #include "util/half_float.h"

 /**
  * Returns the integer payload of an immediate register widened to 64 bits.
  *
  * Signed types (W, D, Q) are sign-extended by the signed-to-unsigned
  * conversion; unsigned types (UW, UD, UQ) are zero-extended.  For the 16-bit
  * types only the low 16 bits of the immediate are looked at.  Any other type
  * hits unreachable().
  */
 static uint64_t
 src_as_uint(const brw_reg &src)
 {
    assert(src.file == IMM);

    uint64_t widened;

    switch (src.type) {
    case BRW_TYPE_W: {
       const int16_t low_word = (int16_t)(src.ud & 0xffff);
       widened = (uint64_t)low_word;
       break;
    }

    case BRW_TYPE_UW: {
       const uint16_t low_word = (uint16_t)(src.ud & 0xffff);
       widened = (uint64_t)low_word;
       break;
    }

    case BRW_TYPE_D:
       widened = (uint64_t)src.d;
       break;

    case BRW_TYPE_UD:
       widened = (uint64_t)src.ud;
       break;

    case BRW_TYPE_Q:
       widened = src.d64;
       break;

    case BRW_TYPE_UQ:
       widened = src.u64;
       break;

    default:
       unreachable("Invalid integer type.");
    }

    return widened;
 }

 /**
  * Returns the value of a floating-point immediate register as a double.
  *
  * HF immediates are taken from the low 16 bits of the immediate and decoded
  * with _mesa_half_to_float(); F and DF immediates are read from their own
  * fields.  Any other type hits unreachable().
  */
 static double
 src_as_float(const brw_reg &src)
 {
    assert(src.file == IMM);

    double widened;

    switch (src.type) {
    case BRW_TYPE_HF: {
       const uint16_t half_bits = (uint16_t)src.d;
       widened = _mesa_half_to_float(half_bits);
       break;
    }

    case BRW_TYPE_F:
       widened = src.f;
       break;

    case BRW_TYPE_DF:
       widened = src.df;
       break;

    default:
       unreachable("Invalid float type.");
    }

    return widened;
 }

 /**
  * Builds an immediate register of integer type \p type holding \p value.
  *
  * \p value is passed unchanged to the immediate constructor that matches
  * \p type; any narrowing is whatever that constructor's parameter conversion
  * does.  Non-integer types hit unreachable().
  */
 static brw_reg
 brw_imm_for_type(uint64_t value, enum brw_reg_type type)
 {
    switch (type) {
    case BRW_TYPE_W:
       return brw_imm_w(value);

    case BRW_TYPE_UW:
       return brw_imm_uw(value);

    case BRW_TYPE_D:
       return brw_imm_d(value);

    case BRW_TYPE_UD:
       return brw_imm_ud(value);

    case BRW_TYPE_Q:
       /* A Q result needs a 64-bit signed immediate.  Building it with the
        * D constructor would produce a D-typed immediate and discard the
        * upper 32 bits of the folded value.
        */
       return brw_imm_q(value);

    case BRW_TYPE_UQ:
       return brw_imm_uq(value);

    default:
       unreachable("Invalid integer type.");
    }
 }

 /**
  * Rewrites a MAD whose two multiplicands (src[1] and src[2]) are scalar
  * immediates as an ADD.
  *
  * The product of the multiplicands is computed here and stored in src[1],
  * typed as the larger of the two multiplicand types.  The instruction is
  * then shrunk to two sources and its opcode changed to ADD; src[0] is left
  * untouched.
  */
 static void
 fold_multiplicands_of_MAD(brw_inst *inst)
 {
    assert(inst->opcode == BRW_OPCODE_MAD);
    assert(inst->src[1].file == IMM &&
           inst->src[2].file == IMM &&
           !brw_type_is_vector_imm(inst->src[1].type) &&
           !brw_type_is_vector_imm(inst->src[2].type));

    /* Both branches type the folded immediate the same way, so work the
     * type out once before src[1] gets overwritten.
     */
    const enum brw_reg_type folded_type =
       brw_type_larger_of(inst->src[1].type, inst->src[2].type);

    if (!brw_type_is_int(inst->src[1].type)) {
       const double lhs = src_as_float(inst->src[1]);
       const double rhs = src_as_float(inst->src[2]);
       const double folded = lhs * rhs;

       switch (folded_type) {
       case BRW_TYPE_HF:
          inst->src[1] = retype(brw_imm_w(_mesa_float_to_half(folded)),
                                BRW_TYPE_HF);
          break;

       case BRW_TYPE_F:
          inst->src[1] = brw_imm_f(folded);
          break;

       case BRW_TYPE_DF:
          unreachable("float64 should be impossible.");
          break;

       default:
          unreachable("Invalid float type.");
       }
    } else {
       const uint64_t lhs = src_as_uint(inst->src[1]);
       const uint64_t rhs = src_as_uint(inst->src[2]);

       inst->src[1] = retype(brw_imm_ud(lhs * rhs), folded_type);
    }

    inst->opcode = BRW_OPCODE_ADD;
    inst->resize_sources(2);
 }

 /**
  * Tries to constant-fold a single instruction in place.
  *
  * When the sources that matter are immediates (or the result is known
  * anyway, such as an integer multiply by zero or a derivative of a uniform
  * or scalar source), the instruction is rewritten as a MOV of an immediate.
  *
  * \param devinfo  Device description; used for the implicit-accumulator
  *                 check on MUL and for reg_unit() on BROADCAST.
  * \param inst     Instruction to fold; modified in place on success.
  *
  * \return true if the instruction was rewritten, false if it was left alone.
  */
 bool
 brw_opt_constant_fold_instruction(const intel_device_info *devinfo, brw_inst *inst)
 {
    brw_reg result;

    /* BAD_FILE means "nothing folded yet".  Cases that succeed overwrite
     * result with an immediate, which the common tail below turns into a MOV.
     */
    result.file = BAD_FILE;

    switch (inst->opcode) {
    case BRW_OPCODE_ADD:
       if (inst->src[0].file != IMM || inst->src[1].file != IMM)
          break;

       if (brw_type_is_int(inst->src[0].type)) {
          const uint64_t src0 = src_as_uint(inst->src[0]);
          const uint64_t src1 = src_as_uint(inst->src[1]);

          result = brw_imm_for_type(src0 + src1, inst->dst.type);
       } else {
          /* Only 32-bit float immediates are handled on the float path. */
          assert(inst->src[0].type == BRW_TYPE_F);
          result = brw_imm_f(inst->src[0].f + inst->src[1].f);
       }

       break;

    case BRW_OPCODE_ADD3:
       if (inst->src[0].file == IMM &&
           inst->src[1].file == IMM &&
           inst->src[2].file == IMM) {
          const uint64_t src0 = src_as_uint(inst->src[0]);
          const uint64_t src1 = src_as_uint(inst->src[1]);
          const uint64_t src2 = src_as_uint(inst->src[2]);

          result = brw_imm_for_type(src0 + src1 + src2, inst->dst.type);
       }

       break;

    case BRW_OPCODE_AND:
       if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
          const uint64_t src0 = src_as_uint(inst->src[0]);
          const uint64_t src1 = src_as_uint(inst->src[1]);

          result = brw_imm_for_type(src0 & src1, inst->dst.type);
          break;
       }

       break;

    case BRW_OPCODE_MAD:
       if (inst->src[0].file == IMM &&
           inst->src[1].file == IMM &&
           inst->src[2].file == IMM &&
           !brw_type_is_vector_imm(inst->src[0].type) &&
           !brw_type_is_vector_imm(inst->src[1].type) &&
           !brw_type_is_vector_imm(inst->src[2].type)) {
          /* First collapse src1 * src2 into one immediate, which leaves an
           * ADD of two immediates...
           */
          fold_multiplicands_of_MAD(inst);
          assert(inst->opcode == BRW_OPCODE_ADD);

          /* ...then fold that ADD the rest of the way by recursing. */
          ASSERTED bool folded = brw_opt_constant_fold_instruction(devinfo, inst);
          assert(folded);

          return true;
       }

       break;

    case BRW_OPCODE_MUL:
       /* Only integer multiplies are folded here. */
       if (brw_type_is_float(inst->src[1].type))
          break;

       /* From the BDW PRM, Vol 2a, "mul - Multiply":
        *
        *    "When multiplying integer datatypes, if src0 is DW and src1
        *    is W, irrespective of the destination datatype, the
        *    accumulator maintains full 48-bit precision."
        *    ...
        *    "When multiplying integer data types, if one of the sources
        *    is a DW, the resulting full precision data is stored in
        *    the accumulator."
        *
        * There are also similar notes in earlier PRMs.
        *
        * The MOV instruction can copy the bits of the source, but it
        * does not clear the higher bits of the accumulator. So, because
        * we might use the full accumulator in the MUL/MACH macro, we
        * shouldn't replace such MULs with MOVs.
        */
       if ((brw_type_size_bytes(inst->src[0].type) == 4 ||
            brw_type_size_bytes(inst->src[1].type) == 4) &&
           (inst->dst.is_accumulator() ||
            inst->writes_accumulator_implicitly(devinfo)))
          break;

       /* x * 0 = 0, even when the other source is not an immediate. */
       if (inst->src[0].is_zero() || inst->src[1].is_zero()) {
          result = brw_imm_d(0);
          break;
       }

       if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
          const uint64_t src0 = src_as_uint(inst->src[0]);
          const uint64_t src1 = src_as_uint(inst->src[1]);

          result = brw_imm_for_type(src0 * src1, inst->dst.type);
          break;
       }
       break;

    case BRW_OPCODE_OR:
       if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
          const uint64_t src0 = src_as_uint(inst->src[0]);
          const uint64_t src1 = src_as_uint(inst->src[1]);

          result = brw_imm_for_type(src0 | src1, inst->dst.type);
          break;
       }

       break;

    case BRW_OPCODE_SHL:
       if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
          /* It's not currently possible to generate this, and this constant
           * folding does not handle it.
           */
          assert(!inst->saturate);

          /* The shift count is masked to 5 bits for 16- and 32-bit sources
           * and to 6 bits for 64-bit sources; the 16-bit result is also
           * masked back down to 16 bits.
           */
          switch (brw_type_size_bytes(inst->src[0].type)) {
          case 2:
             result = brw_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
             break;
          case 4:
             result = brw_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
             break;
          case 8:
             result = brw_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
             break;
          default:
             /* Just in case a future platform re-enables B or UB types. */
             unreachable("Invalid source size.");
          }

          /* The immediate was built with an unsigned type of the source's
           * size; give it the destination's type.
           */
          result = retype(result, inst->dst.type);
       }
       break;

    case SHADER_OPCODE_BROADCAST:
       if (inst->src[0].file == IMM) {
          /* Handled here rather than through the common tail because the
           * execution controls have to change as well.
           */
          inst->opcode = BRW_OPCODE_MOV;
          inst->force_writemask_all = true;
          inst->resize_sources(1);

          /* The destination of BROADCAST will always be is_scalar, so the
           * allocation will always be REG_SIZE * reg_unit. Adjust the
           * exec_size to match.
           */
          inst->exec_size = 8 * reg_unit(devinfo);
          assert(inst->size_written == inst->dst.component_size(inst->exec_size));

          return true;
       }
       break;

    case SHADER_OPCODE_SHUFFLE:
       /* Shuffling an immediate gives that same immediate back. */
       if (inst->src[0].file == IMM)
          result = inst->src[0];

       break;

    case FS_OPCODE_DDX_COARSE:
    case FS_OPCODE_DDX_FINE:
    case FS_OPCODE_DDY_COARSE:
    case FS_OPCODE_DDY_FINE:
       /* The derivative of a uniform or scalar source is folded to zero. */
       if (is_uniform(inst->src[0]) || inst->src[0].is_scalar)
          result = retype(brw_imm_uq(0), inst->dst.type);

       break;

    default:
       break;
    }

    /* Common tail: replace the instruction with "MOV dst, result". */
    if (result.file != BAD_FILE) {
       assert(result.file == IMM);

       inst->opcode = BRW_OPCODE_MOV;
       inst->src[0] = result;
       inst->resize_sources(1);
       return true;
    }

    return false;
 }

 /**
  * Algebraic simplification pass.
  *
  * Walks every instruction of the shader's CFG.  Each instruction is first
  * offered to brw_opt_constant_fold_instruction(); when that does not apply,
  * opcode-specific identities are tried (x + 0, x * 1, x * -1, x | x, ...),
  * which turn the instruction into a simpler MOV/ADD/NOT or strip source
  * modifiers that cannot affect the result.
  *
  * \param s  Shader to optimize; instructions are modified in place and the
  *           data-flow / instruction-detail analyses are invalidated when
  *           anything changed.
  *
  * \return true if any instruction was changed.
  */
 bool
 brw_opt_algebraic(brw_shader &s)
 {
    const intel_device_info *devinfo = s.devinfo;
    bool progress = false;

    foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
       /* Full constant folding takes priority over the identities below. */
       if (brw_opt_constant_fold_instruction(devinfo, inst)) {
          progress = true;
          continue;
       }

       switch (inst->opcode) {
       case BRW_OPCODE_ADD:
          /* a + 0 = a.  Only applied to integer adds. */
          if (brw_type_is_int(inst->src[1].type) &&
                     inst->src[1].is_zero()) {
             inst->opcode = BRW_OPCODE_MOV;
             inst->resize_sources(1);
             progress = true;
          }

          break;

       case BRW_OPCODE_ADD3: {
          const unsigned num_imm = (inst->src[0].file == IMM) +
                                   (inst->src[1].file == IMM) +
                                   (inst->src[2].file == IMM);

          /* If there is more than one immediate value, fold the values and
           * convert the instruction to either ADD or MOV.
           */
          /* Three immediates would already have been folded above. */
          assert(num_imm < 3);
          if (num_imm == 2) {
             uint64_t sum = 0;
             brw_reg src;

             /* Add up the two immediates and remember the one source that
              * is not an immediate.
              */
             for (unsigned i = 0; i < 3; i++) {
                if (inst->src[i].file == IMM) {
                   sum += src_as_uint(inst->src[i]);
                } else {
                   assert(src.file == BAD_FILE);
                   src = inst->src[i];
                }
             }

             assert(src.file != BAD_FILE);

             /* Only the low 32 bits of the sum are considered. */
             if (uint32_t(sum) == 0) {
                inst->opcode = BRW_OPCODE_MOV;
                inst->src[0] = src;
                inst->resize_sources(1);
             } else {
                inst->opcode = BRW_OPCODE_ADD;
                inst->src[0] = src;
                inst->src[1] = brw_imm_ud(sum);
                inst->resize_sources(2);
             }

             progress = true;
          } else if (num_imm == 1) {
             /* If there is a single constant, and that constant is zero,
              * convert the instruction to regular ADD.
              */
             for (unsigned i = 0; i < 3; i++) {
                if (inst->src[i].is_zero()) {
                   inst->opcode = BRW_OPCODE_ADD;
                   /* Move the last source into the slot the zero occupied
                    * so the two remaining operands sit in src[0] and src[1].
                    */
                   inst->src[i] = inst->src[2];
                   inst->resize_sources(2);
                   progress = true;
                   break;
                }
             }
          }

          break;
       }

       case BRW_OPCODE_MOV:
          /* A MOV to null that only tests for zero / non-zero: abs and
           * negate never change whether a value is zero, so drop them.
           */
          if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
               inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
              inst->dst.is_null() &&
              (inst->src[0].abs || inst->src[0].negate)) {
             inst->src[0].abs = false;
             inst->src[0].negate = false;
             progress = true;
             break;
          }

          if (inst->src[0].file != IMM)
             break;

          if (inst->saturate) {
             /* Full mixed-type saturates don't happen.  However, we can end up
              * with things like:
              *
              *    mov.sat(8) g21<1>DF       -1F
              *
              * Other mixed-size-but-same-base-type cases may also be possible.
              */
             if (inst->dst.type != inst->src[0].type &&
                 inst->dst.type != BRW_TYPE_DF &&
                 inst->src[0].type != BRW_TYPE_F)
                unreachable("unimplemented: saturate mixed types");

             /* Saturate the immediate itself so the instruction no longer
              * needs the saturate modifier.
              */
             if (brw_reg_saturate_immediate(&inst->src[0])) {
                inst->saturate = false;
                progress = true;
             }
          }
          break;

       case BRW_OPCODE_MUL:
          if (brw_type_is_int(inst->src[0].type)){
             /* From the BDW PRM, Vol 2a, "mul - Multiply":
              *
              *    "When multiplying integer datatypes, if src0 is DW and src1
              *    is W, irrespective of the destination datatype, the
              *    accumulator maintains full 48-bit precision."
              *    ...
              *    "When multiplying integer data types, if one of the sources
              *    is a DW, the resulting full precision data is stored in the
              *    accumulator."
              *
              * There are also similar notes in earlier PRMs.
              *
              * The MOV instruction can copy the bits of the source, but it
              * does not clear the higher bits of the accumulator. So, because
              * we might use the full accumulator in the MUL/MACH macro, we
              * shouldn't replace such MULs with MOVs.
              */
             if ((brw_type_size_bytes(inst->src[0].type) == 4 ||
                  brw_type_size_bytes(inst->src[1].type) == 4) &&
                 (inst->dst.is_accumulator() ||
                  inst->writes_accumulator_implicitly(devinfo)))
                break;

             for (unsigned i = 0; i < 2; i++) {
                /* a * 1 = a */
                if (inst->src[i].is_one()) {
                   inst->opcode = BRW_OPCODE_MOV;
                } else if (inst->src[i].is_negative_one()) {
                   /* a * -1 = -a */
                   inst->opcode = BRW_OPCODE_MOV;

                   /* If the source other than the -1 is immediate, just
                    * toggling the negation flag will not work. Due to the
                    * previous call to brw_opt_constant_fold_instruction,
                    * this should not be possible.
                    */
                   assert(inst->src[1 - i].file != IMM);
                   inst->src[1 - i].negate = !inst->src[1 - i].negate;
                }

                if (inst->opcode == BRW_OPCODE_MOV) {
                   /* If the literal 1 was src0, put the old src1 in src0. */
                   if (i == 0)
                      inst->src[0] = inst->src[1];

                   inst->resize_sources(1);
                   progress = true;
                   break;
                }
             }
          }
          break;

       case BRW_OPCODE_NOT:
          /*    not.nz    null, g17
           *
           * becomes
           *
           *    mov.z     null, g17
           *
           * These are equivalent, but the latter is easier for cmod prop.
           */
          if (inst->dst.is_null() &&
              inst->conditional_mod != BRW_CONDITIONAL_NONE) {
             assert(!inst->src[0].abs);

             /* With a negate modifier on the source the condition is kept
              * as is and only the modifier is dropped; without one the
              * condition is inverted.
              */
             if (!inst->src[0].negate)
                inst->conditional_mod = brw_negate_cmod(inst->conditional_mod);

             inst->opcode = BRW_OPCODE_MOV;
             inst->src[0].negate = false;
             progress = true;
          }
          break;

       case BRW_OPCODE_OR:
          /* a | a = a and a | 0 = a */
          if (inst->src[0].equals(inst->src[1]) || inst->src[1].is_zero()) {
             /* On Gfx8+, the OR instruction can have a source modifier that
              * performs logical not on the operand.  Cases of 'OR r0, ~r1, 0'
              * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
              */
             if (inst->src[0].negate) {
                inst->opcode = BRW_OPCODE_NOT;
                inst->src[0].negate = false;
             } else {
                inst->opcode = BRW_OPCODE_MOV;
             }
             inst->resize_sources(1);
             progress = true;
             break;
          }
          break;
       case BRW_OPCODE_CMP:
          /* Comparing against zero for (in)equality: abs and negate on the
           * other operand cannot change the outcome, so drop them.
           */
          if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
               inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
              inst->src[1].is_zero() &&
              (inst->src[0].abs || inst->src[0].negate)) {
             inst->src[0].abs = false;
             inst->src[0].negate = false;
             progress = true;
             break;
          }
          break;
       case BRW_OPCODE_SEL:
          /* Floating point SEL.CMOD may flush denorms to zero. We don't have
           * enough information at this point in compilation to know whether or
           * not it is safe to remove that.
           *
           * Integer SEL or SEL without a conditional modifier is just a fancy
           * MOV. Those are always safe to eliminate.
           */
          if (inst->src[0].equals(inst->src[1]) &&
              (!brw_type_is_float(inst->dst.type) ||
               inst->conditional_mod == BRW_CONDITIONAL_NONE)) {
             inst->opcode = BRW_OPCODE_MOV;
             inst->predicate = BRW_PREDICATE_NONE;
             inst->predicate_inverse = false;
             inst->conditional_mod = BRW_CONDITIONAL_NONE;
             inst->resize_sources(1);
             progress = true;
          } else if (inst->saturate && inst->src[1].file == IMM) {
             /* A saturating min/max against a float immediate outside the
              * open interval (0, 1) is already covered by the saturate, so
              * only "MOV.sat src0" needs to remain.
              */
             switch (inst->conditional_mod) {
             case BRW_CONDITIONAL_LE:
             case BRW_CONDITIONAL_L:
                switch (inst->src[1].type) {
                case BRW_TYPE_F:
                   if (inst->src[1].f >= 1.0f) {
                      inst->opcode = BRW_OPCODE_MOV;
                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
                      inst->resize_sources(1);
                      progress = true;
                   }
                   break;
                default:
                   break;
                }
                break;
             case BRW_CONDITIONAL_GE:
             case BRW_CONDITIONAL_G:
                switch (inst->src[1].type) {
                case BRW_TYPE_F:
                   if (inst->src[1].f <= 0.0f) {
                      inst->opcode = BRW_OPCODE_MOV;
                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
                      inst->resize_sources(1);
                      progress = true;
                   }
                   break;
                default:
                   break;
                }
                break;
             default:
                break;
             }
          }
          break;
       case BRW_OPCODE_CSEL:
          if (brw_type_is_float(inst->dst.type)) {
             /* This transformation can both clean up spurious modifiers
              * (making assembly dumps easier to read) and convert GE with -abs
              * to LE with abs. See abs handling below.
              */
             if (inst->src[2].negate) {
                inst->conditional_mod = brw_swap_cmod(inst->conditional_mod);
                inst->src[2].negate = false;
                progress = true;
             }

             if (inst->src[2].abs) {
                switch (inst->conditional_mod) {
                case BRW_CONDITIONAL_Z:
                case BRW_CONDITIONAL_NZ:
                   inst->src[2].abs = false;
                   progress = true;
                   break;

                case BRW_CONDITIONAL_LE:
                   /* Converting to Z can help constant propagation into src0
                    * and src1.
                    */
                   inst->conditional_mod = BRW_CONDITIONAL_Z;
                   inst->src[2].abs = false;
                   progress = true;
                   break;

                default:
                   /* GE or L conditions with absolute value could be used to
                    * implement isnan(x) in CSEL. Transforming G with absolute
                    * value to NZ is **not** NaN safe.
                    */
                   break;
                }
             }
          } else if (brw_type_is_sint(inst->src[2].type)) {
             /* Integer transformations are more challenging than floating
              * point transformations due to INT_MIN == -(INT_MIN) ==
              * abs(INT_MIN).
              */
             if (inst->src[2].negate && inst->src[2].abs) {
                switch (inst->conditional_mod) {
                case BRW_CONDITIONAL_GE:
                   inst->src[2].negate = false;
                   inst->src[2].abs = false;
                   inst->conditional_mod = BRW_CONDITIONAL_Z;
                   progress = true;
                   break;
                case BRW_CONDITIONAL_L:
                   inst->src[2].negate = false;
                   inst->src[2].abs = false;
                   inst->conditional_mod = BRW_CONDITIONAL_NZ;
                   progress = true;
                   break;
                case BRW_CONDITIONAL_G:
                   /* This is a contradiction. -abs(x) cannot be > 0. */
                   inst->opcode = BRW_OPCODE_MOV;
                   inst->src[0] = inst->src[1];
                   inst->resize_sources(1);
                   progress = true;
                   break;
                case BRW_CONDITIONAL_LE:
                   /* This is a tautology. -abs(x) must be <= 0. */
                   inst->opcode = BRW_OPCODE_MOV;
                   inst->resize_sources(1);
                   progress = true;
                   break;
                case BRW_CONDITIONAL_Z:
                case BRW_CONDITIONAL_NZ:
                   inst->src[2].negate = false;
                   inst->src[2].abs = false;
                   progress = true;
                   break;
                default:
                   unreachable("Impossible icsel condition.");
                }
             }
          }
          break;
       case BRW_OPCODE_MAD:
          /* Both multiplicands constant: multiply them now, leaving an ADD. */
          if (inst->src[1].file == IMM &&
              inst->src[2].file == IMM &&
              !brw_type_is_vector_imm(inst->src[1].type) &&
              !brw_type_is_vector_imm(inst->src[2].type)) {
             fold_multiplicands_of_MAD(inst);

             /* This could result in (x + 0). For floats, we want to leave this
              * as an ADD so that a subnormal x will get flushed to zero.
              */
             assert(inst->opcode == BRW_OPCODE_ADD);
             progress = true;
             break;
          }

          /* One multiplicand is 1: the MAD is just an ADD of src[0] and the
           * other multiplicand.
           */
          if (inst->src[1].is_one()) {
             inst->opcode = BRW_OPCODE_ADD;
             inst->src[1] = inst->src[2];
             inst->resize_sources(2);
             progress = true;
          } else if (inst->src[2].is_one()) {
             inst->opcode = BRW_OPCODE_ADD;
             inst->resize_sources(2);
             progress = true;
          }
          break;
       case SHADER_OPCODE_BROADCAST:
          if (is_uniform(inst->src[0])) {
             /* Every channel already holds the same value. */
             inst->opcode = BRW_OPCODE_MOV;
             inst->force_writemask_all = true;

             /* The destination of BROADCAST will always be is_scalar, so the
              * allocation will always be REG_SIZE * reg_unit. Adjust the
              * exec_size to match.
              */
             inst->exec_size = 8 * reg_unit(devinfo);
             assert(inst->size_written == inst->dst.component_size(inst->exec_size));
             inst->resize_sources(1);
             progress = true;
          } else if (inst->src[1].file == IMM) {
             inst->opcode = BRW_OPCODE_MOV;
             /* It's possible that the selected component will be too large and
              * overflow the register.  This can happen if someone does a
              * readInvocation() from GLSL or SPIR-V and provides an OOB
              * invocationIndex.  If this happens and we somehow manage
              * to constant fold it in and get here, then component() may cause
              * us to start reading outside of the VGRF which will lead to an
              * assert later.  Instead, just let it wrap around if it goes over
              * exec_size.
              */
             const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
             inst->src[0] = component(inst->src[0], comp);
             inst->force_writemask_all = true;
             inst->exec_size = 8 * reg_unit(devinfo);
             assert(inst->size_written == inst->dst.component_size(inst->exec_size));
             inst->resize_sources(1);
             progress = true;
          }
          break;

       case SHADER_OPCODE_SHUFFLE:
          if (is_uniform(inst->src[0])) {
             /* Shuffling a uniform value yields the same value. */
             inst->opcode = BRW_OPCODE_MOV;
             inst->resize_sources(1);
             progress = true;
          } else if (inst->src[1].file == IMM) {
             /* Constant index: read that one component directly.
              * NOTE(review): unlike BROADCAST above, the index is not
              * wrapped to exec_size here - confirm that is intended.
              */
             inst->opcode = BRW_OPCODE_MOV;
             inst->src[0] = component(inst->src[0],
                                      inst->src[1].ud);
             inst->resize_sources(1);
             progress = true;
          }
          break;

       default:
 	 break;
       }

       /* Ensure that the correct source has the immediate value. 2-source
        * instructions must have the immediate in src[1]. On Gfx12 and later,
        * some 3-source instructions can have the immediate in src[0] or
        * src[2]. It's complicated, so don't mess with 3-source instructions
        * here.
        */
       /* NOTE(review): `progress` is cumulative for the whole pass, so once
        * any earlier instruction changed, this check also runs for later
        * instructions that were not themselves modified.
        */
       if (progress && inst->sources == 2 && inst->is_commutative()) {
          if (inst->src[0].file == IMM) {
             brw_reg tmp = inst->src[1];
             inst->src[1] = inst->src[0];
             inst->src[0] = tmp;
          }
       }
    }

    if (progress)
       s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW |
                             BRW_DEPENDENCY_INSTRUCTION_DETAIL);

    return progress;
 }
	/*
	* Copyright © 2010 Intel Corporation
	* SPDX-License-Identifier: MIT
	*/

	#include "brw_shader.h"
	#include "brw_builder.h"
	#include "util/half_float.h"

	/**
	 * Returns the integer payload of an immediate register widened to 64 bits.
	 *
	 * Signed types (W, D, Q) are sign-extended by the signed-to-unsigned
	 * conversion; unsigned types (UW, UD, UQ) are zero-extended.  For the 16-bit
	 * types only the low 16 bits of the immediate are looked at.  Any other type
	 * hits unreachable().
	 */
	static uint64_t
	src_as_uint(const brw_reg &src)
	{
	   assert(src.file == IMM);

	   uint64_t widened;

	   switch (src.type) {
	   case BRW_TYPE_W: {
	      const int16_t low_word = (int16_t)(src.ud & 0xffff);
	      widened = (uint64_t)low_word;
	      break;
	   }

	   case BRW_TYPE_UW: {
	      const uint16_t low_word = (uint16_t)(src.ud & 0xffff);
	      widened = (uint64_t)low_word;
	      break;
	   }

	   case BRW_TYPE_D:
	      widened = (uint64_t)src.d;
	      break;

	   case BRW_TYPE_UD:
	      widened = (uint64_t)src.ud;
	      break;

	   case BRW_TYPE_Q:
	      widened = src.d64;
	      break;

	   case BRW_TYPE_UQ:
	      widened = src.u64;
	      break;

	   default:
	      unreachable("Invalid integer type.");
	   }

	   return widened;
	}

	/**
	 * Returns the value of a floating-point immediate register as a double.
	 *
	 * HF immediates are taken from the low 16 bits of the immediate and decoded
	 * with _mesa_half_to_float(); F and DF immediates are read from their own
	 * fields.  Any other type hits unreachable().
	 */
	static double
	src_as_float(const brw_reg &src)
	{
	   assert(src.file == IMM);

	   double widened;

	   switch (src.type) {
	   case BRW_TYPE_HF: {
	      const uint16_t half_bits = (uint16_t)src.d;
	      widened = _mesa_half_to_float(half_bits);
	      break;
	   }

	   case BRW_TYPE_F:
	      widened = src.f;
	      break;

	   case BRW_TYPE_DF:
	      widened = src.df;
	      break;

	   default:
	      unreachable("Invalid float type.");
	   }

	   return widened;
	}

	/**
	 * Builds an immediate register of integer type \p type holding \p value.
	 *
	 * \p value is passed unchanged to the immediate constructor that matches
	 * \p type; any narrowing is whatever that constructor's parameter conversion
	 * does.  Non-integer types hit unreachable().
	 */
	static brw_reg
	brw_imm_for_type(uint64_t value, enum brw_reg_type type)
	{
	   switch (type) {
	   case BRW_TYPE_W:
	      return brw_imm_w(value);

	   case BRW_TYPE_UW:
	      return brw_imm_uw(value);

	   case BRW_TYPE_D:
	      return brw_imm_d(value);

	   case BRW_TYPE_UD:
	      return brw_imm_ud(value);

	   case BRW_TYPE_Q:
	      /* A Q result needs a 64-bit signed immediate.  Building it with the
	       * D constructor would produce a D-typed immediate and discard the
	       * upper 32 bits of the folded value.
	       */
	      return brw_imm_q(value);

	   case BRW_TYPE_UQ:
	      return brw_imm_uq(value);

	   default:
	      unreachable("Invalid integer type.");
	   }
	}

	/**
	 * Rewrites a MAD whose two multiplicands (src[1] and src[2]) are scalar
	 * immediates as an ADD.
	 *
	 * The product of the multiplicands is computed here and stored in src[1],
	 * typed as the larger of the two multiplicand types.  The instruction is
	 * then shrunk to two sources and its opcode changed to ADD; src[0] is left
	 * untouched.
	 */
	static void
	fold_multiplicands_of_MAD(brw_inst *inst)
	{
	   assert(inst->opcode == BRW_OPCODE_MAD);
	   assert(inst->src[1].file == IMM &&
	          inst->src[2].file == IMM &&
	          !brw_type_is_vector_imm(inst->src[1].type) &&
	          !brw_type_is_vector_imm(inst->src[2].type));

	   /* Both branches type the folded immediate the same way, so work the
	    * type out once before src[1] gets overwritten.
	    */
	   const enum brw_reg_type folded_type =
	      brw_type_larger_of(inst->src[1].type, inst->src[2].type);

	   if (!brw_type_is_int(inst->src[1].type)) {
	      const double lhs = src_as_float(inst->src[1]);
	      const double rhs = src_as_float(inst->src[2]);
	      const double folded = lhs * rhs;

	      switch (folded_type) {
	      case BRW_TYPE_HF:
	         inst->src[1] = retype(brw_imm_w(_mesa_float_to_half(folded)),
	                               BRW_TYPE_HF);
	         break;

	      case BRW_TYPE_F:
	         inst->src[1] = brw_imm_f(folded);
	         break;

	      case BRW_TYPE_DF:
	         unreachable("float64 should be impossible.");
	         break;

	      default:
	         unreachable("Invalid float type.");
	      }
	   } else {
	      const uint64_t lhs = src_as_uint(inst->src[1]);
	      const uint64_t rhs = src_as_uint(inst->src[2]);

	      inst->src[1] = retype(brw_imm_ud(lhs * rhs), folded_type);
	   }

	   inst->opcode = BRW_OPCODE_ADD;
	   inst->resize_sources(2);
	}

	bool
	brw_opt_constant_fold_instruction(const intel_device_info devinfo, brw_inst inst)
	{
	brw_reg result;

	result.file = BAD_FILE;

	switch (inst->opcode) {
	case BRW_OPCODE_ADD:
	if (inst->src[0].file != IMM \|\| inst->src[1].file != IMM)
	break;

	if (brw_type_is_int(inst->src[0].type)) {
	const uint64_t src0 = src_as_uint(inst->src[0]);
	const uint64_t src1 = src_as_uint(inst->src[1]);

	result = brw_imm_for_type(src0 + src1, inst->dst.type);
	} else {
	assert(inst->src[0].type == BRW_TYPE_F);
	result = brw_imm_f(inst->src[0].f + inst->src[1].f);
	}

	break;

	case BRW_OPCODE_ADD3:
	if (inst->src[0].file == IMM &&
	inst->src[1].file == IMM &&
	inst->src[2].file == IMM) {
	const uint64_t src0 = src_as_uint(inst->src[0]);
	const uint64_t src1 = src_as_uint(inst->src[1]);
	const uint64_t src2 = src_as_uint(inst->src[2]);

	result = brw_imm_for_type(src0 + src1 + src2, inst->dst.type);
	}

	break;

	case BRW_OPCODE_AND:
	if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
	const uint64_t src0 = src_as_uint(inst->src[0]);
	const uint64_t src1 = src_as_uint(inst->src[1]);

	result = brw_imm_for_type(src0 & src1, inst->dst.type);
	break;
	}

	break;

	case BRW_OPCODE_MAD:
	if (inst->src[0].file == IMM &&
	inst->src[1].file == IMM &&
	inst->src[2].file == IMM &&
	!brw_type_is_vector_imm(inst->src[0].type) &&
	!brw_type_is_vector_imm(inst->src[1].type) &&
	!brw_type_is_vector_imm(inst->src[2].type)) {
	fold_multiplicands_of_MAD(inst);
	assert(inst->opcode == BRW_OPCODE_ADD);

	ASSERTED bool folded = brw_opt_constant_fold_instruction(devinfo, inst);
	assert(folded);

	return true;
	}

	break;

	case BRW_OPCODE_MUL:
	if (brw_type_is_float(inst->src[1].type))
	break;

	/* From the BDW PRM, Vol 2a, "mul - Multiply":
	*
	* "When multiplying integer datatypes, if src0 is DW and src1
	* is W, irrespective of the destination datatype, the
	* accumulator maintains full 48-bit precision."
	* ...
	* "When multiplying integer data types, if one of the sources
	* is a DW, the resulting full precision data is stored in
	* the accumulator."
	*
	* There are also similar notes in earlier PRMs.
	*
	* The MOV instruction can copy the bits of the source, but it
	* does not clear the higher bits of the accumulator. So, because
	* we might use the full accumulator in the MUL/MACH macro, we
	* shouldn't replace such MULs with MOVs.
	*/
	if ((brw_type_size_bytes(inst->src[0].type) == 4 \|\|
	brw_type_size_bytes(inst->src[1].type) == 4) &&
	(inst->dst.is_accumulator() \|\|
	inst->writes_accumulator_implicitly(devinfo)))
	break;

	if (inst->src[0].is_zero() \|\| inst->src[1].is_zero()) {
	result = brw_imm_d(0);
	break;
	}

	if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
	const uint64_t src0 = src_as_uint(inst->src[0]);
	const uint64_t src1 = src_as_uint(inst->src[1]);

	result = brw_imm_for_type(src0 * src1, inst->dst.type);
	break;
	}
	break;

	case BRW_OPCODE_OR:
	if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
	const uint64_t src0 = src_as_uint(inst->src[0]);
	const uint64_t src1 = src_as_uint(inst->src[1]);

	result = brw_imm_for_type(src0 \| src1, inst->dst.type);
	break;
	}

	break;

	case BRW_OPCODE_SHL:
	if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
	/* It's not currently possible to generate this, and this constant
	* folding does not handle it.
	*/
	assert(!inst->saturate);

	switch (brw_type_size_bytes(inst->src[0].type)) {
	case 2:
	result = brw_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
	break;
	case 4:
	result = brw_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
	break;
	case 8:
	result = brw_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
	break;
	default:
	/* Just in case a future platform re-enables B or UB types. */
	unreachable("Invalid source size.");
	}

	result = retype(result, inst->dst.type);
	}
	break;

	case SHADER_OPCODE_BROADCAST:
	if (inst->src[0].file == IMM) {
	inst->opcode = BRW_OPCODE_MOV;
	inst->force_writemask_all = true;
	inst->resize_sources(1);

	/* The destination of BROADCAST will always be is_scalar, so the
	* allocation will always be REG_SIZE * reg_unit. Adjust the
	* exec_size to match.
	*/
	inst->exec_size = 8 * reg_unit(devinfo);
	assert(inst->size_written == inst->dst.component_size(inst->exec_size));

	return true;
	}
	break;

	case SHADER_OPCODE_SHUFFLE:
	if (inst->src[0].file == IMM)
	result = inst->src[0];

	break;

	case FS_OPCODE_DDX_COARSE:
	case FS_OPCODE_DDX_FINE:
	case FS_OPCODE_DDY_COARSE:
	case FS_OPCODE_DDY_FINE:
	if (is_uniform(inst->src[0]) \|\| inst->src[0].is_scalar)
	result = retype(brw_imm_uq(0), inst->dst.type);

	break;

	default:
	break;
	}

	if (result.file != BAD_FILE) {
	assert(result.file == IMM);

	inst->opcode = BRW_OPCODE_MOV;
	inst->src[0] = result;
	inst->resize_sources(1);
	return true;
	}

	return false;
	}

	/**
	* Algebraic simplification pass over every instruction in the shader's CFG.
	*
	* Each instruction is first handed to brw_opt_constant_fold_instruction().
	* If that succeeds the instruction counts as progress and is not examined
	* any further. Otherwise an opcode-specific switch rewrites instructions
	* whose operands make them redundant (identity operands, duplicate sources,
	* spurious source modifiers, ...) into a simpler opcode, usually MOV or ADD.
	*
	* Instructions are modified in place through the iteration variable. When
	* anything changed, the instruction data-flow and instruction-detail
	* analyses of the shader are invalidated before returning.
	*
	* Takes the shader to optimize and returns true if at least one instruction
	* was changed.
	*/
	bool
	brw_opt_algebraic(brw_shader &s)
	{
	const intel_device_info *devinfo = s.devinfo;
	bool progress = false;

	foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
	/* Constant folding takes priority. A folded instruction skips both the
	* opcode switch and the immediate-operand swap at the bottom of the loop.
	*/
	if (brw_opt_constant_fold_instruction(devinfo, inst)) {
	progress = true;
	continue;
	}

	switch (inst->opcode) {
	case BRW_OPCODE_ADD:
	/* a + 0 = a, for integer types only. Presumably floats are excluded
	* for the reason given in the MAD case below (an ADD with zero still
	* flushes a subnormal operand) -- TODO confirm.
	*/
	if (brw_type_is_int(inst->src[1].type) &&
	inst->src[1].is_zero()) {
	inst->opcode = BRW_OPCODE_MOV;
	inst->resize_sources(1);
	progress = true;
	}

	break;

	case BRW_OPCODE_ADD3: {
	const unsigned num_imm = (inst->src[0].file == IMM) +
	(inst->src[1].file == IMM) +
	(inst->src[2].file == IMM);

	/* If there is more than one immediate value, fold the values and
	* convert the instruction to either ADD or MOV.
	*/
	assert(num_imm < 3);
	if (num_imm == 2) {
	uint64_t sum = 0;
	brw_reg src;

	/* Accumulate the two immediates and remember the single
	* non-immediate source. The assert inside the loop checks that only
	* one such source is seen.
	*/
	for (unsigned i = 0; i < 3; i++) {
	if (inst->src[i].file == IMM) {
	sum += src_as_uint(inst->src[i]);
	} else {
	assert(src.file == BAD_FILE);
	src = inst->src[i];
	}
	}

	assert(src.file != BAD_FILE);

	/* Only the low 32 bits of the sum decide between MOV and ADD; the
	* folded constant is emitted through brw_imm_ud().
	*/
	if (uint32_t(sum) == 0) {
	inst->opcode = BRW_OPCODE_MOV;
	inst->src[0] = src;
	inst->resize_sources(1);
	} else {
	inst->opcode = BRW_OPCODE_ADD;
	inst->src[0] = src;
	inst->src[1] = brw_imm_ud(sum);
	inst->resize_sources(2);
	}

	progress = true;
	} else if (num_imm == 1) {
	/* If there is a single constant, and that constant is zero,
	* convert the instruction to regular ADD.
	*
	* The zero source is overwritten with src[2] and the third source
	* is dropped. When the zero already is src[2] (i == 2) the
	* assignment is a self-assignment and only the resize matters.
	*/
	for (unsigned i = 0; i < 3; i++) {
	if (inst->src[i].is_zero()) {
	inst->opcode = BRW_OPCODE_ADD;
	inst->src[i] = inst->src[2];
	inst->resize_sources(2);
	progress = true;
	break;
	}
	}
	}

	break;
	}

	case BRW_OPCODE_MOV:
	/* A MOV to the null register with a Z or NZ conditional modifier only
	* produces a flag result. Drop abs/negate from its source, presumably
	* because neither modifier changes whether the value is zero.
	*/
	if ((inst->conditional_mod == BRW_CONDITIONAL_Z \|\|
	inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
	inst->dst.is_null() &&
	(inst->src[0].abs \|\| inst->src[0].negate)) {
	inst->src[0].abs = false;
	inst->src[0].negate = false;
	progress = true;
	break;
	}

	/* Everything below only applies to MOVs of an immediate. */
	if (inst->src[0].file != IMM)
	break;

	if (inst->saturate) {
	/* Full mixed-type saturates don't happen. However, we can end up
	* with things like:
	*
	* mov.sat(8) g21<1>DF -1F
	*
	* Other mixed-size-but-same-base-type cases may also be possible.
	*
	* The check below therefore accepts matching types, a DF
	* destination, or an F source, and treats anything else as
	* unreachable.
	*/
	if (inst->dst.type != inst->src[0].type &&
	inst->dst.type != BRW_TYPE_DF &&
	inst->src[0].type != BRW_TYPE_F)
	unreachable("unimplemented: saturate mixed types");

	/* When the helper reports success the saturate flag is removed from
	* the instruction; presumably the helper has clamped the immediate
	* in place -- TODO confirm against brw_reg_saturate_immediate().
	*/
	if (brw_reg_saturate_immediate(&inst->src[0])) {
	inst->saturate = false;
	progress = true;
	}
	}
	break;

	case BRW_OPCODE_MUL:
	/* Only integer multiplies (judged by the type of src[0]) are
	* simplified here.
	*/
	if (brw_type_is_int(inst->src[0].type)){
	/* From the BDW PRM, Vol 2a, "mul - Multiply":
	*
	* "When multiplying integer datatypes, if src0 is DW and src1
	* is W, irrespective of the destination datatype, the
	* accumulator maintains full 48-bit precision."
	* ...
	* "When multiplying integer data types, if one of the sources
	* is a DW, the resulting full precision data is stored in the
	* accumulator."
	*
	* There are also similar notes in earlier PRMs.
	*
	* The MOV instruction can copy the bits of the source, but it
	* does not clear the higher bits of the accumulator. So, because
	* we might use the full accumulator in the MUL/MACH macro, we
	* shouldn't replace such MULs with MOVs.
	*/
	if ((brw_type_size_bytes(inst->src[0].type) == 4 \|\|
	brw_type_size_bytes(inst->src[1].type) == 4) &&
	(inst->dst.is_accumulator() \|\|
	inst->writes_accumulator_implicitly(devinfo)))
	break;

	/* Look at both operands for a literal 1 or -1. */
	for (unsigned i = 0; i < 2; i++) {
	/* a * 1 = a */
	if (inst->src[i].is_one()) {
	inst->opcode = BRW_OPCODE_MOV;
	} else if (inst->src[i].is_negative_one()) {
	/* a * -1 = -a */
	inst->opcode = BRW_OPCODE_MOV;

	/* If the source other than the -1 is immediate, just
	* toggling the negation flag will not work. Due to the
	* previous call to brw_opt_constant_fold_instruction, this
	* should not be possible.
	*/
	assert(inst->src[1 - i].file != IMM);
	inst->src[1 - i].negate = !inst->src[1 - i].negate;
	}

	/* The opcode was MUL on entry to the loop, so seeing MOV here
	* means one of the two branches above fired for this operand.
	*/
	if (inst->opcode == BRW_OPCODE_MOV) {
	/* If the literal 1 was src0, put the old src1 in src0. */
	if (i == 0)
	inst->src[0] = inst->src[1];

	inst->resize_sources(1);
	progress = true;
	break;
	}
	}
	}
	break;

	case BRW_OPCODE_NOT:
	/* not.nz null, g17
	*
	* becomes
	*
	* mov.z null, g17
	*
	* These are equivalent, but the latter is easier for cmod prop.
	*
	* If the source already carries the negate modifier, the conditional
	* modifier is kept as-is and only the modifier is removed.
	*/
	if (inst->dst.is_null() &&
	inst->conditional_mod != BRW_CONDITIONAL_NONE) {
	assert(!inst->src[0].abs);

	if (!inst->src[0].negate)
	inst->conditional_mod = brw_negate_cmod(inst->conditional_mod);

	inst->opcode = BRW_OPCODE_MOV;
	inst->src[0].negate = false;
	progress = true;
	}
	break;

	case BRW_OPCODE_OR:
	/* a | a = a and a | 0 = a */
	if (inst->src[0].equals(inst->src[1]) \|\| inst->src[1].is_zero()) {
	/* On Gfx8+, the OR instruction can have a source modifier that
	* performs logical not on the operand. Cases of 'OR r0, ~r1, 0'
	* or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
	*/
	if (inst->src[0].negate) {
	inst->opcode = BRW_OPCODE_NOT;
	inst->src[0].negate = false;
	} else {
	inst->opcode = BRW_OPCODE_MOV;
	}
	inst->resize_sources(1);
	progress = true;
	break;
	}
	break;
	case BRW_OPCODE_CMP:
	/* Same modifier cleanup as the MOV case above, for a Z/NZ comparison
	* against zero: abs/negate on src[0] are removed.
	*/
	if ((inst->conditional_mod == BRW_CONDITIONAL_Z \|\|
	inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
	inst->src[1].is_zero() &&
	(inst->src[0].abs \|\| inst->src[0].negate)) {
	inst->src[0].abs = false;
	inst->src[0].negate = false;
	progress = true;
	break;
	}
	break;
	case BRW_OPCODE_SEL:
	/* Floating point SEL.CMOD may flush denorms to zero. We don't have
	* enough information at this point in compilation to know whether or
	* not it is safe to remove that.
	*
	* Integer SEL or SEL without a conditional modifier is just a fancy
	* MOV. Those are always safe to eliminate.
	*/
	if (inst->src[0].equals(inst->src[1]) &&
	(!brw_type_is_float(inst->dst.type) \|\|
	inst->conditional_mod == BRW_CONDITIONAL_NONE)) {
	/* Both choices are the same register, so the predicate and the
	* conditional modifier are cleared along with the second source.
	*/
	inst->opcode = BRW_OPCODE_MOV;
	inst->predicate = BRW_PREDICATE_NONE;
	inst->predicate_inverse = false;
	inst->conditional_mod = BRW_CONDITIONAL_NONE;
	inst->resize_sources(1);
	progress = true;
	} else if (inst->saturate && inst->src[1].file == IMM) {
	/* Saturating SEL against an F immediate. With L/LE and an immediate
	* of at least 1.0, or G/GE and an immediate of at most 0.0, the
	* select is turned into a saturating MOV of src[0]; presumably
	* because the saturate already applies that bound -- TODO confirm.
	* Other immediate types are left untouched.
	*/
	switch (inst->conditional_mod) {
	case BRW_CONDITIONAL_LE:
	case BRW_CONDITIONAL_L:
	switch (inst->src[1].type) {
	case BRW_TYPE_F:
	if (inst->src[1].f >= 1.0f) {
	inst->opcode = BRW_OPCODE_MOV;
	inst->conditional_mod = BRW_CONDITIONAL_NONE;
	inst->resize_sources(1);
	progress = true;
	}
	break;
	default:
	break;
	}
	break;
	case BRW_CONDITIONAL_GE:
	case BRW_CONDITIONAL_G:
	switch (inst->src[1].type) {
	case BRW_TYPE_F:
	if (inst->src[1].f <= 0.0f) {
	inst->opcode = BRW_OPCODE_MOV;
	inst->conditional_mod = BRW_CONDITIONAL_NONE;
	inst->resize_sources(1);
	progress = true;
	}
	break;
	default:
	break;
	}
	break;
	default:
	break;
	}
	}
	break;
	case BRW_OPCODE_CSEL:
	/* All CSEL rewrites below act on the source modifiers of src[2]
	* together with the conditional modifier.
	*/
	if (brw_type_is_float(inst->dst.type)) {
	/* This transformation can both clean up spurious modifiers
	* (making assembly dumps easier to read) and convert GE with -abs
	* to LE with abs. See abs handling below.
	*/
	if (inst->src[2].negate) {
	inst->conditional_mod = brw_swap_cmod(inst->conditional_mod);
	inst->src[2].negate = false;
	progress = true;
	}

	if (inst->src[2].abs) {
	switch (inst->conditional_mod) {
	case BRW_CONDITIONAL_Z:
	case BRW_CONDITIONAL_NZ:
	/* abs is simply dropped for an equal / not-equal test. */
	inst->src[2].abs = false;
	progress = true;
	break;

	case BRW_CONDITIONAL_LE:
	/* Converting to Z can help constant propagation into src0
	* and src1.
	*/
	inst->conditional_mod = BRW_CONDITIONAL_Z;
	inst->src[2].abs = false;
	progress = true;
	break;

	default:
	/* GE or L conditions with absolute value could be used to
	* implement isnan(x) in CSEL. Transforming G with absolute
	* value to NZ is not NaN safe.
	*/
	break;
	}
	}
	} else if (brw_type_is_sint(inst->src[2].type)) {
	/* Integer transformations are more challenging than floating
	* point transformations due to INT_MIN == -(INT_MIN) ==
	* abs(INT_MIN).
	*
	* Only the combined -abs(x) form is handled.
	*/
	if (inst->src[2].negate && inst->src[2].abs) {
	switch (inst->conditional_mod) {
	case BRW_CONDITIONAL_GE:
	/* -abs(x) >= 0 is rewritten as x == 0. */
	inst->src[2].negate = false;
	inst->src[2].abs = false;
	inst->conditional_mod = BRW_CONDITIONAL_Z;
	progress = true;
	break;
	case BRW_CONDITIONAL_L:
	/* -abs(x) < 0 is rewritten as x != 0. */
	inst->src[2].negate = false;
	inst->src[2].abs = false;
	inst->conditional_mod = BRW_CONDITIONAL_NZ;
	progress = true;
	break;
	case BRW_CONDITIONAL_G:
	/* This is a contradiction. -abs(x) cannot be > 0. The result
	* is therefore always src[1].
	*/
	inst->opcode = BRW_OPCODE_MOV;
	inst->src[0] = inst->src[1];
	inst->resize_sources(1);
	progress = true;
	break;
	case BRW_CONDITIONAL_LE:
	/* This is a tautology. -abs(x) must be <= 0. The result is
	* therefore always src[0].
	*/
	inst->opcode = BRW_OPCODE_MOV;
	inst->resize_sources(1);
	progress = true;
	break;
	case BRW_CONDITIONAL_Z:
	case BRW_CONDITIONAL_NZ:
	/* Both modifiers are dropped for an equal / not-equal test. */
	inst->src[2].negate = false;
	inst->src[2].abs = false;
	progress = true;
	break;
	default:
	unreachable("Impossible icsel condition.");
	}
	}
	}
	break;
	case BRW_OPCODE_MAD:
	/* src[1] and src[2] are the multiplicands. If both are scalar
	* immediates, fold_multiplicands_of_MAD() turns the MAD into an ADD.
	*/
	if (inst->src[1].file == IMM &&
	inst->src[2].file == IMM &&
	!brw_type_is_vector_imm(inst->src[1].type) &&
	!brw_type_is_vector_imm(inst->src[2].type)) {
	fold_multiplicands_of_MAD(inst);

	/* This could result in (x + 0). For floats, we want to leave this
	* as an ADD so that a subnormal x will get flushed to zero.
	*/
	assert(inst->opcode == BRW_OPCODE_ADD);
	progress = true;
	break;
	}

	/* A multiplicand of one leaves an ADD of src[0] and the other
	* multiplicand.
	*/
	if (inst->src[1].is_one()) {
	inst->opcode = BRW_OPCODE_ADD;
	inst->src[1] = inst->src[2];
	inst->resize_sources(2);
	progress = true;
	} else if (inst->src[2].is_one()) {
	inst->opcode = BRW_OPCODE_ADD;
	inst->resize_sources(2);
	progress = true;
	}
	break;
	case SHADER_OPCODE_BROADCAST:
	/* A BROADCAST of a uniform source, or with an immediate index, becomes
	* a MOV executed with force_writemask_all.
	*/
	if (is_uniform(inst->src[0])) {
	inst->opcode = BRW_OPCODE_MOV;
	inst->force_writemask_all = true;

	/* The destination of BROADCAST will always be is_scalar, so the
	* allocation will always be REG_SIZE * reg_unit. Adjust the
	* exec_size to match.
	*/
	inst->exec_size = 8 * reg_unit(devinfo);
	assert(inst->size_written == inst->dst.component_size(inst->exec_size));
	inst->resize_sources(1);
	progress = true;
	} else if (inst->src[1].file == IMM) {
	inst->opcode = BRW_OPCODE_MOV;
	/* It's possible that the selected component will be too large and
	* overflow the register. This can happen if someone does a
	* readInvocation() from GLSL or SPIR-V and provides an OOB
	* invocationIndex. If this happens and we somehow manage
	* to constant fold it in and get here, then component() may cause
	* us to start reading outside of the VGRF which will lead to an
	* assert later. Instead, just let it wrap around if it goes over
	* exec_size.
	*
	* The index is computed from the original exec_size, before
	* exec_size is overwritten below.
	*/
	const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
	inst->src[0] = component(inst->src[0], comp);
	inst->force_writemask_all = true;
	inst->exec_size = 8 * reg_unit(devinfo);
	assert(inst->size_written == inst->dst.component_size(inst->exec_size));
	inst->resize_sources(1);
	progress = true;
	}
	break;

	case SHADER_OPCODE_SHUFFLE:
	/* A SHUFFLE of a uniform source, or with an immediate index, becomes a
	* plain MOV. exec_size and force_writemask_all are left unchanged here.
	*/
	if (is_uniform(inst->src[0])) {
	inst->opcode = BRW_OPCODE_MOV;
	inst->resize_sources(1);
	progress = true;
	} else if (inst->src[1].file == IMM) {
	/* NOTE(review): unlike the BROADCAST case above, the immediate
	* index is passed to component() as-is, without wrapping it to
	* exec_size. Confirm that an out-of-range index cannot reach here.
	*/
	inst->opcode = BRW_OPCODE_MOV;
	inst->src[0] = component(inst->src[0],
	inst->src[1].ud);
	inst->resize_sources(1);
	progress = true;
	}
	break;

	default:
	break;
	}

	/* Ensure that the correct source has the immediate value. 2-source
	* instructions must have the immediate in src[1]. On Gfx12 and later,
	* some 3-source instructions can have the immediate in src[0] or
	* src[2]. It's complicated, so don't mess with 3-source instructions
	* here.
	*
	* NOTE(review): `progress` is the pass-wide flag, not a per-instruction
	* one, so once any earlier instruction has changed, this swap is
	* considered for every later instruction that reaches this point.
	*/
	if (progress && inst->sources == 2 && inst->is_commutative()) {
	if (inst->src[0].file == IMM) {
	brw_reg tmp = inst->src[1];
	inst->src[1] = inst->src[0];
	inst->src[0] = tmp;
	}
	}
	}

	/* Opcodes, sources and modifiers may have changed, so cached analyses
	* that depend on them are dropped.
	*/
	if (progress)
	s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW \|
	BRW_DEPENDENCY_INSTRUCTION_DETAIL);

	return progress;
	}