| /* |
| * Copyright 2024 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include <stdint.h> |
| #include "util/half_float.h" |
| |
| #include "brw_shader.h" |
| #include "brw_builder.h" |
| |
| struct brw_reduction_info { |
| brw_reg identity; |
| enum opcode op; |
| brw_conditional_mod cond_mod; |
| }; |
| |
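/* Map a reduction operation and data type to the instruction used to
 * combine two values (either a binary opcode, or SEL with a conditional
 * modifier for MIN/MAX) and the identity value used to fill dead
 * channels: e.g. 0 for ADD/OR/XOR, ~0 for AND, 1 for MUL, and the
 * type's extreme values for MIN/MAX.
 */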
| static brw_reduction_info |
| brw_get_reduction_info(brw_reduce_op red_op, brw_reg_type type) |
| { |
| struct brw_reduction_info info; |
| |
| info.op = BRW_OPCODE_SEL; |
| info.cond_mod = BRW_CONDITIONAL_NONE; |
| |
| switch (red_op) { |
| case BRW_REDUCE_OP_ADD: info.op = BRW_OPCODE_ADD; break; |
| case BRW_REDUCE_OP_MUL: info.op = BRW_OPCODE_MUL; break; |
| case BRW_REDUCE_OP_AND: info.op = BRW_OPCODE_AND; break; |
| case BRW_REDUCE_OP_OR: info.op = BRW_OPCODE_OR; break; |
| case BRW_REDUCE_OP_XOR: info.op = BRW_OPCODE_XOR; break; |
| case BRW_REDUCE_OP_MIN: info.cond_mod = BRW_CONDITIONAL_L; break; |
| case BRW_REDUCE_OP_MAX: info.cond_mod = BRW_CONDITIONAL_GE; break; |
| default: |
| unreachable("invalid reduce op"); |
| } |
| |
| switch (red_op) { |
| case BRW_REDUCE_OP_ADD: |
| case BRW_REDUCE_OP_XOR: |
| case BRW_REDUCE_OP_OR: |
| info.identity = retype(brw_imm_u64(0), type); |
| return info; |
| case BRW_REDUCE_OP_AND: |
| info.identity = retype(brw_imm_u64(~0ull), type); |
| return info; |
| default: |
| /* Continue below. */ |
| break; |
| } |
| |
| brw_reg id; |
| const unsigned size = brw_type_size_bytes(type); |
| |
| switch (red_op) { |
| case BRW_REDUCE_OP_MUL: { |
| if (brw_type_is_int(type)) { |
| id = size < 4 ? brw_imm_uw(1) : |
| size == 4 ? brw_imm_ud(1) : |
| brw_imm_u64(1); |
| } else { |
| assert(brw_type_is_float(type)); |
| id = size == 2 ? brw_imm_uw(_mesa_float_to_half(1.0)) : |
| size == 4 ? brw_imm_f(1.0) : |
| brw_imm_df(1.0); |
| } |
| break; |
| } |
| |
| case BRW_REDUCE_OP_MIN: { |
| if (brw_type_is_uint(type)) { |
| id = brw_imm_u64(~0ull); |
| } else if (brw_type_is_sint(type)) { |
| id = size == 1 ? brw_imm_w(INT8_MAX) : |
| size == 2 ? brw_imm_w(INT16_MAX) : |
| size == 4 ? brw_imm_d(INT32_MAX) : |
| brw_imm_q(INT64_MAX); |
| } else { |
| assert(brw_type_is_float(type)); |
| id = size == 2 ? brw_imm_uw(_mesa_float_to_half(INFINITY)) : |
| size == 4 ? brw_imm_f(INFINITY) : |
| brw_imm_df(INFINITY); |
| } |
| break; |
| } |
| |
| case BRW_REDUCE_OP_MAX: { |
| if (brw_type_is_uint(type)) { |
| id = brw_imm_u64(0); |
| } else if (brw_type_is_sint(type)) { |
| id = size == 1 ? brw_imm_w(INT8_MIN) : |
| size == 2 ? brw_imm_w(INT16_MIN) : |
| size == 4 ? brw_imm_d(INT32_MIN) : |
| brw_imm_q(INT64_MIN); |
| } else { |
| assert(brw_type_is_float(type)); |
| id = size == 2 ? brw_imm_uw(_mesa_float_to_half(-INFINITY)) : |
| size == 4 ? brw_imm_f(-INFINITY) : |
| brw_imm_df(-INFINITY); |
| } |
| break; |
| } |
| |
| default: |
| unreachable("invalid reduce op"); |
| } |
| |
/* For some of the cases above (e.g. all bits zero, all bits one, lowest
 * bit one) either the size or the signedness of the type was ignored, so
 * adjust the final type now.
 *
 * B/UB types can't be used in immediates, so we used W/UW above and do
 * the same here.
 */
| if (type == BRW_TYPE_UB) type = BRW_TYPE_UW; |
| else if (type == BRW_TYPE_B) type = BRW_TYPE_W; |
| |
| info.identity = retype(id, type); |
| |
| return info; |
| } |
| |
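/* Emit one combining step of a scan: right[i] = OP(left[i], right[i]),
 * where left and right are strided windows into tmp.  A left stride of
 * 0 broadcasts a single channel across every channel of right.
 */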
| static void |
| brw_emit_scan_step(const brw_builder &bld, enum opcode opcode, brw_conditional_mod mod, |
| const brw_reg &tmp, |
| unsigned left_offset, unsigned left_stride, |
| unsigned right_offset, unsigned right_stride) |
| { |
| brw_reg left, right; |
| left = horiz_stride(horiz_offset(tmp, left_offset), left_stride); |
| right = horiz_stride(horiz_offset(tmp, right_offset), right_stride); |
| if ((tmp.type == BRW_TYPE_Q || tmp.type == BRW_TYPE_UQ) && |
| (!bld.shader->devinfo->has_64bit_int || bld.shader->devinfo->ver >= 20)) { |
| switch (opcode) { |
| case BRW_OPCODE_MUL: |
| /* This will get lowered by integer MUL lowering */ |
| set_condmod(mod, bld.emit(opcode, right, left, right)); |
| break; |
| |
| case BRW_OPCODE_SEL: { |
/* In order for the decomposition into high and low halves below to
 * work out right, we need our comparison to be strict.  That's fine
 * for MIN/MAX: when the sources are equal either one may be selected,
 * so GE can safely become G.
 */
| assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE); |
| if (mod == BRW_CONDITIONAL_GE) |
| mod = BRW_CONDITIONAL_G; |
| |
| /* We treat the bottom 32 bits as unsigned regardless of |
| * whether or not the integer as a whole is signed. |
| */ |
| brw_reg right_low = subscript(right, BRW_TYPE_UD, 0); |
| brw_reg left_low = subscript(left, BRW_TYPE_UD, 0); |
| |
| /* The upper bits get the same sign as the 64-bit type */ |
| brw_reg_type type32 = brw_type_with_size(tmp.type, 32); |
| brw_reg right_high = subscript(right, type32, 1); |
| brw_reg left_high = subscript(left, type32, 1); |
| |
| /* Build up our comparison: |
| * |
| * l_hi < r_hi || (l_hi == r_hi && l_low < r_low) |
| */ |
| bld.CMP(bld.null_reg_ud(), retype(left_low, BRW_TYPE_UD), |
| retype(right_low, BRW_TYPE_UD), mod); |
| set_predicate(BRW_PREDICATE_NORMAL, |
| bld.CMP(bld.null_reg_ud(), left_high, right_high, |
| BRW_CONDITIONAL_EQ)); |
| set_predicate_inv(BRW_PREDICATE_NORMAL, true, |
| bld.CMP(bld.null_reg_ud(), left_high, right_high, mod)); |
| |
/* We could use SELs here, but predicated MOVs do the same thing
 * because the destination and the second source (if this were a SEL)
 * are the same register.
 */
| set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_low, left_low)); |
| set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_high, left_high)); |
| break; |
| } |
| |
| default: |
| unreachable("Unsupported 64-bit scan op"); |
| } |
| } else { |
| set_condmod(mod, bld.emit(opcode, right, left, right)); |
| } |
| } |
| |
| static void |
| brw_emit_scan(const brw_builder &bld, enum opcode opcode, const brw_reg &tmp, |
| unsigned cluster_size, brw_conditional_mod mod) |
| { |
| unsigned dispatch_width = bld.dispatch_width(); |
| assert(dispatch_width >= 8); |
| |
/* The instruction splitting code isn't advanced enough to split these,
 * so we need to handle that ourselves.
 */
| if (dispatch_width * brw_type_size_bytes(tmp.type) > 2 * REG_SIZE) { |
| const unsigned half_width = dispatch_width / 2; |
| const brw_builder ubld = bld.exec_all().group(half_width, 0); |
| brw_reg left = tmp; |
| brw_reg right = horiz_offset(tmp, half_width); |
| brw_emit_scan(ubld, opcode, left, cluster_size, mod); |
| brw_emit_scan(ubld, opcode, right, cluster_size, mod); |
| if (cluster_size > half_width) { |
| brw_emit_scan_step(ubld, opcode, mod, tmp, |
| half_width - 1, 0, half_width, 1); |
| } |
| return; |
| } |
| |
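/* The steps below build the scan as a tree of strided combines.  As an
 * illustrative sketch, an 8-wide cluster proceeds as follows (ch<i>
 * denotes channel i of tmp):
 *
 *   pairs:  ch1 = op(ch0, ch1), ch3 = op(ch2, ch3), ... up to ch7
 *   quads:  ch2 = op(ch1, ch2), ch3 = op(ch1, ch3); likewise ch6 and
 *           ch7 from ch5
 *   octet:  ch4..ch7 each combined with ch3
 *
 * after which channel i holds the reduction of channels 0..i.
 */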
| if (cluster_size > 1) { |
| const brw_builder ubld = bld.exec_all().group(dispatch_width / 2, 0); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, 0, 2, 1, 2); |
| } |
| |
| if (cluster_size > 2) { |
| if (brw_type_size_bytes(tmp.type) <= 4) { |
| const brw_builder ubld = |
| bld.exec_all().group(dispatch_width / 4, 0); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 2, 4); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 3, 4); |
| } else { |
| /* For 64-bit types, we have to do things differently because |
| * the code above would land us with destination strides that |
| * the hardware can't handle. Fortunately, we'll only be |
| * 8-wide in that case and it's the same number of |
| * instructions. |
| */ |
| const brw_builder ubld = bld.exec_all().group(2, 0); |
| for (unsigned i = 0; i < dispatch_width; i += 4) |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i + 1, 0, i + 2, 1); |
| } |
| } |
| |
| for (unsigned i = 4; |
| i < MIN2(cluster_size, dispatch_width); |
| i *= 2) { |
| const brw_builder ubld = bld.exec_all().group(i, 0); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i - 1, 0, i, 1); |
| |
| if (dispatch_width > i * 2) |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1); |
| |
| if (dispatch_width > i * 4) { |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1); |
| } |
| } |
| } |
| |
| static bool |
| brw_lower_reduce(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->dst.type == inst->src[0].type); |
| brw_reg dst = inst->dst; |
| brw_reg src = inst->src[0]; |
| |
| assert(inst->src[1].file == IMM); |
| enum brw_reduce_op op = (enum brw_reduce_op)inst->src[1].ud; |
| |
| assert(inst->src[2].file == IMM); |
| unsigned cluster_size = inst->src[2].ud; |
| |
| assert(cluster_size > 0); |
| assert(cluster_size <= s.dispatch_width); |
| |
| struct brw_reduction_info info = brw_get_reduction_info(op, src.type); |
| |
/* Set up a register for all of our scratching around and initialize it
 * to the reduction operation's identity value, so that disabled
 * channels don't affect the result.
 */
| brw_reg scan = bld.vgrf(src.type); |
| bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity); |
| |
| brw_emit_scan(bld, info.op, scan, cluster_size, info.cond_mod); |
| |
| if (cluster_size * brw_type_size_bytes(src.type) >= REG_SIZE * 2) { |
/* The CLUSTER_BROADCAST instruction isn't needed here because the
 * distance between clusters is at least 2 GRFs.  That means we can skip
 * its weird striding and just use regular MOVs.
 */
| assert((cluster_size * brw_type_size_bytes(src.type)) % (REG_SIZE * 2) == 0); |
| const unsigned groups = |
| (s.dispatch_width * brw_type_size_bytes(src.type)) / (REG_SIZE * 2); |
| const unsigned group_size = s.dispatch_width / groups; |
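/* For example, assuming 32-byte GRFs, SIMD32 with a 64-bit type yields
 * 4 groups of 8 channels, each group copying from the last channel of
 * whichever cluster it falls in.
 */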
| for (unsigned i = 0; i < groups; i++) { |
| const unsigned cluster = (i * group_size) / cluster_size; |
| const unsigned comp = cluster * cluster_size + (cluster_size - 1); |
| bld.group(group_size, i).MOV(horiz_offset(dst, i * group_size), |
| component(scan, comp)); |
| } |
| } else { |
| bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dst, scan, |
| brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size)); |
| } |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_scan(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->dst.type == inst->src[0].type); |
| brw_reg dst = inst->dst; |
| brw_reg src = inst->src[0]; |
| |
| assert(inst->src[1].file == IMM); |
| enum brw_reduce_op op = (enum brw_reduce_op)inst->src[1].ud; |
| |
| struct brw_reduction_info info = brw_get_reduction_info(op, src.type); |
| |
/* Set up a register for all of our scratching around and initialize it
 * to the reduction operation's identity value, so that disabled
 * channels don't affect the result.
 */
| brw_reg scan = bld.vgrf(src.type); |
| const brw_builder ubld = bld.exec_all(); |
| ubld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity); |
| |
| if (inst->opcode == SHADER_OPCODE_EXCLUSIVE_SCAN) { |
| /* Exclusive scan is a bit harder because we have to do an annoying |
| * shift of the contents before we can begin. To make things worse, |
| * we can't do this with a normal stride; we have to use indirects. |
| */ |
| brw_reg shifted = bld.vgrf(src.type); |
| brw_reg idx = bld.vgrf(BRW_TYPE_UW); |
| |
/* Set the saturate modifier on the offset index computation so that
 * channel 0's index of -1 clamps to 0 instead of wrapping around in
 * the UW destination.  A wrapped index would read past the end of the
 * register file, leading to hangs on Xe3.
 */
| set_saturate(true, ubld.ADD(idx, bld.LOAD_SUBGROUP_INVOCATION(), |
| brw_imm_w(-1))); |
| ubld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx); |
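/* Channel 0 read its own value (its index was clamped to 0); overwrite
 * it with the identity.
 */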
| ubld.group(1, 0).MOV(horiz_offset(shifted, 0), info.identity); |
| scan = shifted; |
| } |
| |
| brw_emit_scan(bld, info.op, scan, s.dispatch_width, info.cond_mod); |
| |
| bld.MOV(dst, scan); |
| |
| inst->remove(); |
| return true; |
| } |
| |
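/* Fill the f0.0 flag register (and f0.1 for SIMD32) with a known value
 * and return it, so later predication isn't affected by stale bits left
 * over from inactive channels.
 */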
| static brw_reg |
| brw_fill_flag(const brw_builder &bld, unsigned v) |
| { |
| const brw_builder ubld1 = bld.uniform(); |
| brw_reg flag = brw_flag_reg(0, 0); |
| |
| if (bld.shader->dispatch_width == 32) { |
| /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ |
| flag = retype(flag, BRW_TYPE_UD); |
| ubld1.MOV(flag, brw_imm_ud(v)); |
| } else { |
| ubld1.MOV(flag, brw_imm_uw(v & 0xFFFF)); |
| } |
| |
| return flag; |
| } |
| |
| static void |
| brw_lower_dispatch_width_vote(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const unsigned dispatch_width = bld.shader->dispatch_width; |
| |
| assert(opcode == SHADER_OPCODE_VOTE_ANY || |
| opcode == SHADER_OPCODE_VOTE_ALL || |
| opcode == SHADER_OPCODE_VOTE_EQUAL); |
| |
| const bool any = opcode == SHADER_OPCODE_VOTE_ANY; |
| const bool equal = opcode == SHADER_OPCODE_VOTE_EQUAL; |
| |
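/* VOTE_EQUAL is implemented as "all channels equal to a reference":
 * every channel is compared against a uniformized copy of the source
 * using the Z condition, then fed through the ALL predicate below.
 * ANY/ALL compare the boolean source against zero.
 */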
| const brw_reg ref = equal ? bld.emit_uniformize(src) : brw_imm_d(0); |
| |
/* The any/all predicates do not consider channel enables. To prevent
 * dead channels from affecting the result, we initialize the flag with
 * the identity value for the logical operation.
 */
| brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF); |
| bld.CMP(bld.null_reg_d(), src, ref, equal ? BRW_CONDITIONAL_Z |
| : BRW_CONDITIONAL_NZ); |
| |
| /* For some reason, the any/all predicates don't work properly with |
| * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H |
| * doesn't read the correct subset of the flag register and you end up |
| * getting garbage in the second half. Work around this by using a pair |
| * of 1-wide MOVs and scattering the result. |
| * |
| * TODO: Check if we still need this for newer platforms. |
| */ |
| const brw_builder ubld = devinfo->ver >= 20 ? bld.exec_all() |
| : bld.uniform(); |
| brw_reg res1 = ubld.MOV(brw_imm_d(0)); |
| |
| enum brw_predicate pred; |
| if (any) { |
| pred = devinfo->ver >= 20 ? XE2_PREDICATE_ANY : |
| dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H : |
| dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H : |
| BRW_PREDICATE_ALIGN1_ANY32H; |
| } else { |
| pred = devinfo->ver >= 20 ? XE2_PREDICATE_ALL : |
| dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : |
| dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : |
| BRW_PREDICATE_ALIGN1_ALL32H; |
| } |
| set_predicate(pred, ubld.MOV(res1, brw_imm_d(-1))); |
| |
| bld.MOV(retype(dst, BRW_TYPE_D), component(res1, 0)); |
| } |
| |
| static void |
| brw_lower_quad_vote_gfx9(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src) |
| { |
| assert(opcode == SHADER_OPCODE_VOTE_ANY || opcode == SHADER_OPCODE_VOTE_ALL); |
| const bool any = opcode == SHADER_OPCODE_VOTE_ANY; |
| |
/* The any/all predicates do not consider channel enables. To prevent
 * dead channels from affecting the result, we initialize the flag with
 * the identity value for the logical operation.
 */
| brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF); |
| bld.CMP(bld.null_reg_ud(), src, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); |
| bld.exec_all().MOV(retype(dst, BRW_TYPE_UD), brw_imm_ud(0)); |
| |
| /* Before Xe2, we can use specialized predicates. */ |
| const enum brw_predicate pred = any ? BRW_PREDICATE_ALIGN1_ANY4H |
| : BRW_PREDICATE_ALIGN1_ALL4H; |
| |
| brw_inst *mov = bld.MOV(retype(dst, BRW_TYPE_D), brw_imm_d(-1)); |
| set_predicate(pred, mov); |
| } |
| |
| static void |
| brw_lower_quad_vote_gfx20(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src) |
| { |
| assert(opcode == SHADER_OPCODE_VOTE_ANY || opcode == SHADER_OPCODE_VOTE_ALL); |
| const bool any = opcode == SHADER_OPCODE_VOTE_ANY; |
| |
/* This code is going to manipulate the flag register, so clear it first
 * to avoid any residual value from disabled channels.
 */
| brw_reg flag = brw_fill_flag(bld, 0); |
| |
/* Compute the mask of invocations where the condition is true.  Note
 * that the mask read back from the flag is replicated to each
 * invocation.
 */
| bld.CMP(bld.null_reg_ud(), src, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); |
| brw_reg cond_mask = bld.vgrf(BRW_TYPE_UD); |
| bld.MOV(cond_mask, flag); |
| |
| /* Mask of invocations in the quad, each invocation will get |
| * all the bits set for their quad, i.e. invocations 0-3 will have |
| * 0b...1111, invocations 4-7 will have 0b...11110000 and so on. |
| */ |
| brw_reg invoc_ud = bld.vgrf(BRW_TYPE_UD); |
| bld.MOV(invoc_ud, bld.LOAD_SUBGROUP_INVOCATION()); |
| brw_reg quad_mask = |
| bld.SHL(brw_imm_ud(0xF), bld.AND(invoc_ud, brw_imm_ud(0xFFFFFFFC))); |
| |
/* Each invocation now has bits set for the invocations in its own quad
 * that pass the condition.  This value is uniform within each quad.
 */
| brw_reg tmp = bld.AND(cond_mask, quad_mask); |
| |
| if (any) { |
| bld.CMP(retype(dst, BRW_TYPE_UD), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ); |
| } else { |
/* Filter quad_mask down to the active channels. */
| brw_reg active = bld.vgrf(BRW_TYPE_UD); |
| bld.exec_all().emit(SHADER_OPCODE_LOAD_LIVE_CHANNELS, active); |
| bld.MOV(active, brw_reg(component(active, 0))); |
| bld.AND(quad_mask, quad_mask, active); |
| |
| bld.CMP(retype(dst, BRW_TYPE_UD), tmp, quad_mask, BRW_CONDITIONAL_Z); |
| } |
| } |
| |
| static bool |
| brw_lower_vote(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| brw_reg dst = inst->dst; |
| brw_reg src = inst->src[0]; |
| |
| unsigned cluster_size; |
| if (inst->sources > 1) { |
| assert(inst->src[1].file == IMM); |
| cluster_size = inst->src[1].ud; |
| } else { |
| cluster_size = s.dispatch_width; |
| } |
| |
| if (cluster_size == s.dispatch_width) { |
| brw_lower_dispatch_width_vote(bld, inst->opcode, dst, src); |
| } else { |
| assert(cluster_size == 4); |
| if (s.devinfo->ver < 20) |
| brw_lower_quad_vote_gfx9(bld, inst->opcode, dst, src); |
| else |
| brw_lower_quad_vote_gfx20(bld, inst->opcode, dst, src); |
| } |
| |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_ballot(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| brw_reg value = retype(inst->src[0], BRW_TYPE_UD); |
| brw_reg dst = inst->dst; |
| |
| const brw_builder xbld = dst.is_scalar ? bld.scalar_group() : bld; |
| |
| if (value.file == IMM) { |
/* Implement a fast path for ballot(true). */
| if (!value.is_zero()) { |
| brw_reg tmp = bld.vgrf(BRW_TYPE_UD); |
| bld.exec_all().emit(SHADER_OPCODE_LOAD_LIVE_CHANNELS, tmp); |
| xbld.MOV(dst, brw_reg(component(tmp, 0))); |
| } else { |
| brw_reg zero = retype(brw_imm_uq(0), dst.type); |
| xbld.MOV(dst, zero); |
| } |
| } else { |
| brw_reg flag = brw_fill_flag(bld, 0); |
| bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); |
| xbld.MOV(dst, flag); |
| } |
| |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_quad_swap(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->dst.type == inst->src[0].type); |
| brw_reg dst = inst->dst; |
| brw_reg value = inst->src[0]; |
| |
| assert(inst->src[1].file == IMM); |
| enum brw_swap_direction dir = (enum brw_swap_direction)inst->src[1].ud; |
| |
| switch (dir) { |
| case BRW_SWAP_HORIZONTAL: { |
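/* Swap adjacent channels: tmp[2k] = value[2k+1] and
 * tmp[2k+1] = value[2k], using a pair of half-width strided MOVs.
 */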
| const brw_reg tmp = bld.vgrf(value.type); |
| |
| const brw_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0); |
| |
| const brw_reg src_left = horiz_stride(value, 2); |
| const brw_reg src_right = horiz_stride(horiz_offset(value, 1), 2); |
| const brw_reg tmp_left = horiz_stride(tmp, 2); |
| const brw_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2); |
| |
| ubld.MOV(tmp_left, src_right); |
| ubld.MOV(tmp_right, src_left); |
| |
| bld.MOV(retype(dst, value.type), tmp); |
| break; |
| } |
| case BRW_SWAP_VERTICAL: |
| case BRW_SWAP_DIAGONAL: { |
| if (brw_type_size_bits(value.type) == 32) { |
| /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ |
| const unsigned swizzle = dir == BRW_SWAP_VERTICAL ? BRW_SWIZZLE4(2,3,0,1) |
| : BRW_SWIZZLE4(3,2,1,0); |
| const brw_reg tmp = bld.vgrf(value.type); |
| const brw_builder ubld = bld.exec_all(); |
| ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, brw_imm_ud(swizzle)); |
| bld.MOV(dst, tmp); |
| } else { |
| /* For larger data types, we have to either emit dispatch_width many |
| * MOVs or else fall back to doing indirects. |
| */ |
| const unsigned xor_mask = dir == BRW_SWAP_VERTICAL ? 0x2 : 0x3; |
| brw_reg idx = bld.vgrf(BRW_TYPE_W); |
| bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(xor_mask)); |
| bld.emit(SHADER_OPCODE_SHUFFLE, dst, value, idx); |
| } |
| break; |
| } |
| } |
| |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_read_from_live_channel(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->sources == 1); |
| assert(inst->dst.type == inst->src[0].type); |
| brw_reg dst = inst->dst; |
| brw_reg value = inst->src[0]; |
| |
| bld.MOV(dst, bld.emit_uniformize(value)); |
| |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_read_from_channel(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->sources == 2); |
| assert(inst->dst.type == inst->src[0].type); |
| |
| brw_reg dst = inst->dst; |
| brw_reg value = inst->src[0]; |
| brw_reg index = retype(inst->src[1], BRW_TYPE_UD); |
| |
/* When the subgroup_size picked by NIR is larger than the dispatch
 * width picked by the backend (this can happen in RT and FS), bound the
 * invocation index to the dispatch width.
 */
| const unsigned dispatch_width_mask = s.dispatch_width - 1; |
| |
| if (index.file == IMM) { |
/* Always apply the mask here since it is cheap. */
| bld.MOV(dst, component(value, index.ud & dispatch_width_mask)); |
| } else { |
| if (s.api_subgroup_size == 0 || s.dispatch_width < s.api_subgroup_size) |
| index = bld.AND(index, brw_imm_ud(dispatch_width_mask)); |
| |
| brw_reg tmp = bld.BROADCAST(value, bld.emit_uniformize(index)); |
| bld.MOV(dst, tmp); |
| } |
| |
| inst->remove(); |
| return true; |
| } |
| |
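/* Lower the subgroup-operation virtual opcodes (reductions, scans,
 * votes, ballots, quad swaps, and channel reads) into sequences of
 * ordinary ALU instructions.
 */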
| bool |
| brw_lower_subgroup_ops(brw_shader &s) |
| { |
| bool progress = false; |
| |
| foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) { |
| switch (inst->opcode) { |
| case SHADER_OPCODE_REDUCE: |
| progress |= brw_lower_reduce(s, inst); |
| break; |
| |
| case SHADER_OPCODE_INCLUSIVE_SCAN: |
| case SHADER_OPCODE_EXCLUSIVE_SCAN: |
| progress |= brw_lower_scan(s, inst); |
| break; |
| |
| case SHADER_OPCODE_VOTE_ANY: |
| case SHADER_OPCODE_VOTE_ALL: |
| case SHADER_OPCODE_VOTE_EQUAL: |
| progress |= brw_lower_vote(s, inst); |
| break; |
| |
| case SHADER_OPCODE_BALLOT: |
| progress |= brw_lower_ballot(s, inst); |
| break; |
| |
| case SHADER_OPCODE_QUAD_SWAP: |
| progress |= brw_lower_quad_swap(s, inst); |
| break; |
| |
| case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL: |
| progress |= brw_lower_read_from_live_channel(s, inst); |
| break; |
| |
| case SHADER_OPCODE_READ_FROM_CHANNEL: |
| progress |= brw_lower_read_from_channel(s, inst); |
| break; |
| |
| default: |
| /* Nothing to do. */ |
| break; |
| } |
| } |
| |
| if (progress) |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | |
| BRW_DEPENDENCY_VARIABLES); |
| |
| return progress; |
| } |