| /* |
| * Copyright 2024 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include <stdint.h> |
| #include "util/half_float.h" |
| |
| #include "brw_shader.h" |
| #include "brw_builder.h" |
| |
| struct brw_reduction_info { |
| brw_reg identity; |
| enum opcode op; |
| brw_conditional_mod cond_mod; |
| }; |
| |
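/* Map a reduction operation and data type to the instruction used to
 * combine two values (either a binary opcode, or SEL with a conditional
 * modifier for MIN/MAX) and the identity value used to fill dead
 * channels: e.g. 0 for ADD/OR/XOR, ~0 for AND, 1 for MUL, and the
 * type's extreme values for MIN/MAX.
 */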
| static brw_reduction_info |
| brw_get_reduction_info(brw_reduce_op red_op, brw_reg_type type) |
| { |
| struct brw_reduction_info info; |
| |
| info.op = BRW_OPCODE_SEL; |
| info.cond_mod = BRW_CONDITIONAL_NONE; |
| |
| switch (red_op) { |
| case BRW_REDUCE_OP_ADD: info.op = BRW_OPCODE_ADD; break; |
| case BRW_REDUCE_OP_MUL: info.op = BRW_OPCODE_MUL; break; |
| case BRW_REDUCE_OP_AND: info.op = BRW_OPCODE_AND; break; |
| case BRW_REDUCE_OP_OR: info.op = BRW_OPCODE_OR; break; |
| case BRW_REDUCE_OP_XOR: info.op = BRW_OPCODE_XOR; break; |
| case BRW_REDUCE_OP_MIN: info.cond_mod = BRW_CONDITIONAL_L; break; |
| case BRW_REDUCE_OP_MAX: info.cond_mod = BRW_CONDITIONAL_GE; break; |
| default: |
| unreachable("invalid reduce op"); |
| } |
| |
| switch (red_op) { |
| case BRW_REDUCE_OP_ADD: |
| case BRW_REDUCE_OP_XOR: |
| case BRW_REDUCE_OP_OR: |
| info.identity = retype(brw_imm_u64(0), type); |
| return info; |
| case BRW_REDUCE_OP_AND: |
| info.identity = retype(brw_imm_u64(~0ull), type); |
| return info; |
| default: |
| /* Continue below. */ |
| break; |
| } |
| |
| brw_reg id; |
| const unsigned size = brw_type_size_bytes(type); |
| |
| switch (red_op) { |
| case BRW_REDUCE_OP_MUL: { |
| if (brw_type_is_int(type)) { |
| id = size < 4 ? brw_imm_uw(1) : |
| size == 4 ? brw_imm_ud(1) : |
| brw_imm_u64(1); |
| } else { |
| assert(brw_type_is_float(type)); |
| id = size == 2 ? brw_imm_uw(_mesa_float_to_half(1.0)) : |
| size == 4 ? brw_imm_f(1.0) : |
| brw_imm_df(1.0); |
| } |
| break; |
| } |
| |
| case BRW_REDUCE_OP_MIN: { |
| if (brw_type_is_uint(type)) { |
| id = brw_imm_u64(~0ull); |
| } else if (brw_type_is_sint(type)) { |
| id = size == 1 ? brw_imm_w(INT8_MAX) : |
| size == 2 ? brw_imm_w(INT16_MAX) : |
| size == 4 ? brw_imm_d(INT32_MAX) : |
| brw_imm_q(INT64_MAX); |
| } else { |
| assert(brw_type_is_float(type)); |
| id = size == 2 ? brw_imm_uw(_mesa_float_to_half(INFINITY)) : |
| size == 4 ? brw_imm_f(INFINITY) : |
| brw_imm_df(INFINITY); |
| } |
| break; |
| } |
| |
| case BRW_REDUCE_OP_MAX: { |
| if (brw_type_is_uint(type)) { |
| id = brw_imm_u64(0); |
| } else if (brw_type_is_sint(type)) { |
| id = size == 1 ? brw_imm_w(INT8_MIN) : |
| size == 2 ? brw_imm_w(INT16_MIN) : |
| size == 4 ? brw_imm_d(INT32_MIN) : |
| brw_imm_q(INT64_MIN); |
| } else { |
| assert(brw_type_is_float(type)); |
| id = size == 2 ? brw_imm_uw(_mesa_float_to_half(-INFINITY)) : |
| size == 4 ? brw_imm_f(-INFINITY) : |
| brw_imm_df(-INFINITY); |
| } |
| break; |
| } |
| |
| default: |
| unreachable("invalid reduce op"); |
| } |
| |
/* For some of the cases above (e.g. all bits zero, all bits one, lowest
 * bit one) either the size or the signedness of the type was ignored, so
 * adjust the final type now.
 *
 * B/UB types can't be used in immediates, so we used W/UW above and do
 * the same here.
 */
| if (type == BRW_TYPE_UB) type = BRW_TYPE_UW; |
| else if (type == BRW_TYPE_B) type = BRW_TYPE_W; |
| |
| info.identity = retype(id, type); |
| |
| return info; |
| } |
| |
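/* Emit one combining step of a scan: right[i] = OP(left[i], right[i]),
 * where left and right are strided windows into tmp.  A left stride of
 * 0 broadcasts a single channel across every channel of right.
 */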
| static void |
| brw_emit_scan_step(const brw_builder &bld, enum opcode opcode, brw_conditional_mod mod, |
| const brw_reg &tmp, |
| unsigned left_offset, unsigned left_stride, |
| unsigned right_offset, unsigned right_stride) |
| { |
| brw_reg left, right; |
| left = horiz_stride(horiz_offset(tmp, left_offset), left_stride); |
| right = horiz_stride(horiz_offset(tmp, right_offset), right_stride); |
| if ((tmp.type == BRW_TYPE_Q || tmp.type == BRW_TYPE_UQ) && |
| (!bld.shader->devinfo->has_64bit_int || bld.shader->devinfo->ver >= 20)) { |
| switch (opcode) { |
| case BRW_OPCODE_MUL: |
| /* This will get lowered by integer MUL lowering */ |
| set_condmod(mod, bld.emit(opcode, right, left, right)); |
| break; |
| |
| case BRW_OPCODE_SEL: { |
/* In order for the decomposition into high and low halves below to
 * work out right, we need our comparison to be strict.  That's fine
 * for MIN/MAX: when the sources are equal either one may be selected,
 * so GE can safely become G.
 */
| assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE); |
| if (mod == BRW_CONDITIONAL_GE) |
| mod = BRW_CONDITIONAL_G; |
| |
| /* We treat the bottom 32 bits as unsigned regardless of |
| * whether or not the integer as a whole is signed. |
| */ |
| brw_reg right_low = subscript(right, BRW_TYPE_UD, 0); |
| brw_reg left_low = subscript(left, BRW_TYPE_UD, 0); |
| |
| /* The upper bits get the same sign as the 64-bit type */ |
| brw_reg_type type32 = brw_type_with_size(tmp.type, 32); |
| brw_reg right_high = subscript(right, type32, 1); |
| brw_reg left_high = subscript(left, type32, 1); |
| |
| /* Build up our comparison: |
| * |
| * l_hi < r_hi || (l_hi == r_hi && l_low < r_low) |
| */ |
| bld.CMP(bld.null_reg_ud(), retype(left_low, BRW_TYPE_UD), |
| retype(right_low, BRW_TYPE_UD), mod); |
| set_predicate(BRW_PREDICATE_NORMAL, |
| bld.CMP(bld.null_reg_ud(), left_high, right_high, |
| BRW_CONDITIONAL_EQ)); |
| set_predicate_inv(BRW_PREDICATE_NORMAL, true, |
| bld.CMP(bld.null_reg_ud(), left_high, right_high, mod)); |
| |
/* We could use SELs here, but predicated MOVs do the same thing
 * because the destination and the second source (if this were a SEL)
 * are the same register.
 */
| set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_low, left_low)); |
| set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_high, left_high)); |
| break; |
| } |
| |
| default: |
| unreachable("Unsupported 64-bit scan op"); |
| } |
| } else { |
| set_condmod(mod, bld.emit(opcode, right, left, right)); |
| } |
| } |
| |
| static void |
| brw_emit_scan(const brw_builder &bld, enum opcode opcode, const brw_reg &tmp, |
| unsigned cluster_size, brw_conditional_mod mod) |
| { |
| unsigned dispatch_width = bld.dispatch_width(); |
| assert(dispatch_width >= 8); |
| |
/* The instruction splitting code isn't advanced enough to split these,
 * so we need to handle that ourselves.
 */
| if (dispatch_width * brw_type_size_bytes(tmp.type) > 2 * REG_SIZE) { |
| const unsigned half_width = dispatch_width / 2; |
| const brw_builder ubld = bld.exec_all().group(half_width, 0); |
| brw_reg left = tmp; |
| brw_reg right = horiz_offset(tmp, half_width); |
| brw_emit_scan(ubld, opcode, left, cluster_size, mod); |
| brw_emit_scan(ubld, opcode, right, cluster_size, mod); |
| if (cluster_size > half_width) { |
| brw_emit_scan_step(ubld, opcode, mod, tmp, |
| half_width - 1, 0, half_width, 1); |
| } |
| return; |
| } |
| |
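/* The steps below build the scan as a tree of strided combines.  As an
 * illustrative sketch, an 8-wide cluster proceeds as follows (ch<i>
 * denotes channel i of tmp):
 *
 *   pairs:  ch1 = op(ch0, ch1), ch3 = op(ch2, ch3), ... up to ch7
 *   quads:  ch2 = op(ch1, ch2), ch3 = op(ch1, ch3); likewise ch6 and
 *           ch7 from ch5
 *   octet:  ch4..ch7 each combined with ch3
 *
 * after which channel i holds the reduction of channels 0..i.
 */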
| if (cluster_size > 1) { |
| const brw_builder ubld = bld.exec_all().group(dispatch_width / 2, 0); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, 0, 2, 1, 2); |
| } |
| |
| if (cluster_size > 2) { |
| if (brw_type_size_bytes(tmp.type) <= 4) { |
| const brw_builder ubld = |
| bld.exec_all().group(dispatch_width / 4, 0); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 2, 4); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 3, 4); |
| } else { |
| /* For 64-bit types, we have to do things differently because |
| * the code above would land us with destination strides that |
| * the hardware can't handle. Fortunately, we'll only be |
| * 8-wide in that case and it's the same number of |
| * instructions. |
| */ |
| const brw_builder ubld = bld.exec_all().group(2, 0); |
| for (unsigned i = 0; i < dispatch_width; i += 4) |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i + 1, 0, i + 2, 1); |
| } |
| } |
| |
| for (unsigned i = 4; |
| i < MIN2(cluster_size, dispatch_width); |
| i *= 2) { |
| const brw_builder ubld = bld.exec_all().group(i, 0); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i - 1, 0, i, 1); |
| |
| if (dispatch_width > i * 2) |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1); |
| |
| if (dispatch_width > i * 4) { |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1); |
| brw_emit_scan_step(ubld, opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1); |
| } |
| } |
| } |
| |
| static bool |
| brw_lower_reduce(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->dst.type == inst->src[0].type); |
| brw_reg dst = inst->dst; |
| brw_reg src = inst->src[0]; |
| |
| assert(inst->src[1].file == IMM); |
| enum brw_reduce_op op = (enum brw_reduce_op)inst->src[1].ud; |
| |
| assert(inst->src[2].file == IMM); |
| unsigned cluster_size = inst->src[2].ud; |
| |
| assert(cluster_size > 0); |
| assert(cluster_size <= s.dispatch_width); |
| |
| struct brw_reduction_info info = brw_get_reduction_info(op, src.type); |
| |
/* Set up a register for all of our scratching around and initialize it
 * to the reduction operation's identity value, so that disabled
 * channels don't affect the result.
 */
| brw_reg scan = bld.vgrf(src.type); |
| bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity); |
| |
| brw_emit_scan(bld, info.op, scan, cluster_size, info.cond_mod); |
| |
| if (cluster_size * brw_type_size_bytes(src.type) >= REG_SIZE * 2) { |
/* The CLUSTER_BROADCAST instruction isn't needed here because the
 * distance between clusters is at least 2 GRFs.  That means we can skip
 * its weird striding and just use regular MOVs.
 */
| assert((cluster_size * brw_type_size_bytes(src.type)) % (REG_SIZE * 2) == 0); |
| const unsigned groups = |
| (s.dispatch_width * brw_type_size_bytes(src.type)) / (REG_SIZE * 2); |
| const unsigned group_size = s.dispatch_width / groups; |
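/* For example, assuming 32-byte GRFs, SIMD32 with a 64-bit type yields
 * 4 groups of 8 channels, each group copying from the last channel of
 * whichever cluster it falls in.
 */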
| for (unsigned i = 0; i < groups; i++) { |
| const unsigned cluster = (i * group_size) / cluster_size; |
| const unsigned comp = cluster * cluster_size + (cluster_size - 1); |
| bld.group(group_size, i).MOV(horiz_offset(dst, i * group_size), |
| component(scan, comp)); |
| } |
| } else { |
| bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dst, scan, |
| brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size)); |
| } |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_scan(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->dst.type == inst->src[0].type); |
| brw_reg dst = inst->dst; |
| brw_reg src = inst->src[0]; |
| |
| assert(inst->src[1].file == IMM); |
| enum brw_reduce_op op = (enum brw_reduce_op)inst->src[1].ud; |
| |
| struct brw_reduction_info info = brw_get_reduction_info(op, src.type); |
| |
/* Set up a register for all of our scratching around and initialize it
 * to the reduction operation's identity value, so that disabled
 * channels don't affect the result.
 */
| brw_reg scan = bld.vgrf(src.type); |
| const brw_builder ubld = bld.exec_all(); |
| ubld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity); |
| |
| if (inst->opcode == SHADER_OPCODE_EXCLUSIVE_SCAN) { |
| /* Exclusive scan is a bit harder because we have to do an annoying |
| * shift of the contents before we can begin. To make things worse, |
| * we can't do this with a normal stride; we have to use indirects. |
| */ |
| brw_reg shifted = bld.vgrf(src.type); |
| brw_reg idx = bld.vgrf(BRW_TYPE_UW); |
| |
/* Set the saturate modifier on the offset index computation so that
 * channel 0's index of -1 clamps to 0 instead of wrapping around in
 * the UW destination.  A wrapped index would read past the end of the
 * register file, leading to hangs on Xe3.
 */
| set_saturate(true, ubld.ADD(idx, bld.LOAD_SUBGROUP_INVOCATION(), |
| brw_imm_w(-1))); |
| ubld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx); |
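/* Channel 0 read its own value (its index was clamped to 0); overwrite
 * it with the identity.
 */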
| ubld.group(1, 0).MOV(horiz_offset(shifted, 0), info.identity); |
| scan = shifted; |
| } |
| |
| brw_emit_scan(bld, info.op, scan, s.dispatch_width, info.cond_mod); |
| |
| bld.MOV(dst, scan); |
| |
| inst->remove(); |
| return true; |
| } |
| |
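/* Fill the f0.0 flag register (and f0.1 for SIMD32) with a known value
 * and return it, so later predication isn't affected by stale bits left
 * over from inactive channels.
 */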
| static brw_reg |
| brw_fill_flag(const brw_builder &bld, unsigned v) |
| { |
| const brw_builder ubld1 = bld.uniform(); |
| brw_reg flag = brw_flag_reg(0, 0); |
| |
| if (bld.shader->dispatch_width == 32) { |
| /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ |
| flag = retype(flag, BRW_TYPE_UD); |
| ubld1.MOV(flag, brw_imm_ud(v)); |
| } else { |
| ubld1.MOV(flag, brw_imm_uw(v & 0xFFFF)); |
| } |
| |
| return flag; |
| } |
| |
| static void |
| brw_lower_dispatch_width_vote(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const unsigned dispatch_width = bld.shader->dispatch_width; |
| |
| assert(opcode == SHADER_OPCODE_VOTE_ANY || |
| opcode == SHADER_OPCODE_VOTE_ALL || |
| opcode == SHADER_OPCODE_VOTE_EQUAL); |
| |
| const bool any = opcode == SHADER_OPCODE_VOTE_ANY; |
| const bool equal = opcode == SHADER_OPCODE_VOTE_EQUAL; |
| |
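/* VOTE_EQUAL is implemented as "all channels equal to a reference":
 * every channel is compared against a uniformized copy of the source
 * using the Z condition, then fed through the ALL predicate below.
 * ANY/ALL compare the boolean source against zero.
 */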
| const brw_reg ref = equal ? bld.emit_uniformize(src) : brw_imm_d(0); |
| |
/* The any/all predicates do not consider channel enables. To prevent
 * dead channels from affecting the result, we initialize the flag with
 * the identity value for the logical operation.
 */
| brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF); |
| bld.CMP(bld.null_reg_d(), src, ref, equal ? BRW_CONDITIONAL_Z |
| : BRW_CONDITIONAL_NZ); |
| |
| /* For some reason, the any/all predicates don't work properly with |
| * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H |
| * doesn't read the correct subset of the flag register and you end up |
| * getting garbage in the second half. Work around this by using a pair |
| * of 1-wide MOVs and scattering the result. |
| * |
| * TODO: Check if we still need this for newer platforms. |
| */ |
| const brw_builder ubld = devinfo->ver >= 20 ? bld.exec_all() |
| : bld.uniform(); |
| brw_reg res1 = ubld.MOV(brw_imm_d(0)); |
| |
| enum brw_predicate pred; |
| if (any) { |
| pred = devinfo->ver >= 20 ? XE2_PREDICATE_ANY : |
| dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H : |
| dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H : |
| BRW_PREDICATE_ALIGN1_ANY32H; |
| } else { |
| pred = devinfo->ver >= 20 ? XE2_PREDICATE_ALL : |
| dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : |
| dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : |
| BRW_PREDICATE_ALIGN1_ALL32H; |
| } |
| set_predicate(pred, ubld.MOV(res1, brw_imm_d(-1))); |
| |
| bld.MOV(retype(dst, BRW_TYPE_D), component(res1, 0)); |
| } |
| |
| static void |
| brw_lower_quad_vote_gfx9(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src) |
| { |
| assert(opcode == SHADER_OPCODE_VOTE_ANY || opcode == SHADER_OPCODE_VOTE_ALL); |
| const bool any = opcode == SHADER_OPCODE_VOTE_ANY; |
| |
/* The any/all predicates do not consider channel enables. To prevent
 * dead channels from affecting the result, we initialize the flag with
 * the identity value for the logical operation.
 */
| brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF); |
| bld.CMP(bld.null_reg_ud(), src, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); |
| bld.exec_all().MOV(retype(dst, BRW_TYPE_UD), brw_imm_ud(0)); |
| |
| /* Before Xe2, we can use specialized predicates. */ |
| const enum brw_predicate pred = any ? BRW_PREDICATE_ALIGN1_ANY4H |
| : BRW_PREDICATE_ALIGN1_ALL4H; |
| |
| brw_inst *mov = bld.MOV(retype(dst, BRW_TYPE_D), brw_imm_d(-1)); |
| set_predicate(pred, mov); |
| } |
| |
| static void |
| brw_lower_quad_vote_gfx20(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src) |
| { |
| assert(opcode == SHADER_OPCODE_VOTE_ANY || opcode == SHADER_OPCODE_VOTE_ALL); |
| const bool any = opcode == SHADER_OPCODE_VOTE_ANY; |
| |
/* This code is going to manipulate the flag register, so clear it first
 * to avoid any residual value from disabled channels.
 */
| brw_reg flag = brw_fill_flag(bld, 0); |
| |
/* Compute the mask of invocations where the condition is true.  Note
 * that the mask read back from the flag is replicated to each
 * invocation.
 */
| bld.CMP(bld.null_reg_ud(), src, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); |
| brw_reg cond_mask = bld.vgrf(BRW_TYPE_UD); |
| bld.MOV(cond_mask, flag); |
| |
| /* Mask of invocations in the quad, each invocation will get |
| * all the bits set for their quad, i.e. invocations 0-3 will have |
| * 0b...1111, invocations 4-7 will have 0b...11110000 and so on. |
| */ |
| brw_reg invoc_ud = bld.vgrf(BRW_TYPE_UD); |
| bld.MOV(invoc_ud, bld.LOAD_SUBGROUP_INVOCATION()); |
| brw_reg quad_mask = |
| bld.SHL(brw_imm_ud(0xF), bld.AND(invoc_ud, brw_imm_ud(0xFFFFFFFC))); |
| |
/* Each invocation now has bits set for the invocations in its own quad
 * that pass the condition.  This value is uniform within each quad.
 */
| brw_reg tmp = bld.AND(cond_mask, quad_mask); |
| |
| if (any) { |
| bld.CMP(retype(dst, BRW_TYPE_UD), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ); |
| } else { |
/* Filter quad_mask down to the active channels. */
| brw_reg active = bld.vgrf(BRW_TYPE_UD); |
| bld.exec_all().emit(SHADER_OPCODE_LOAD_LIVE_CHANNELS, active); |
| bld.MOV(active, brw_reg(component(active, 0))); |
| bld.AND(quad_mask, quad_mask, active); |
| |
| bld.CMP(retype(dst, BRW_TYPE_UD), tmp, quad_mask, BRW_CONDITIONAL_Z); |
| } |
| } |
| |
| static bool |
| brw_lower_vote(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| brw_reg dst = inst->dst; |
| brw_reg src = inst->src[0]; |
| |
| unsigned cluster_size; |
| if (inst->sources > 1) { |
| assert(inst->src[1].file == IMM); |
| cluster_size = inst->src[1].ud; |
| } else { |
| cluster_size = s.dispatch_width; |
| } |
| |
| if (cluster_size == s.dispatch_width) { |
| brw_lower_dispatch_width_vote(bld, inst->opcode, dst, src); |
| } else { |
| assert(cluster_size == 4); |
| if (s.devinfo->ver < 20) |
| brw_lower_quad_vote_gfx9(bld, inst->opcode, dst, src); |
| else |
| brw_lower_quad_vote_gfx20(bld, inst->opcode, dst, src); |
| } |
| |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_ballot(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| brw_reg value = retype(inst->src[0], BRW_TYPE_UD); |
| brw_reg dst = inst->dst; |
| |
| const brw_builder xbld = dst.is_scalar ? bld.scalar_group() : bld; |
| |
| if (value.file == IMM) { |
/* Implement a fast path for ballot(true). */
| if (!value.is_zero()) { |
| brw_reg tmp = bld.vgrf(BRW_TYPE_UD); |
| bld.exec_all().emit(SHADER_OPCODE_LOAD_LIVE_CHANNELS, tmp); |
| xbld.MOV(dst, brw_reg(component(tmp, 0))); |
| } else { |
| brw_reg zero = retype(brw_imm_uq(0), dst.type); |
| xbld.MOV(dst, zero); |
| } |
| } else { |
| brw_reg flag = brw_fill_flag(bld, 0); |
| bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); |
| xbld.MOV(dst, flag); |
| } |
| |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_quad_swap(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->dst.type == inst->src[0].type); |
| brw_reg dst = inst->dst; |
| brw_reg value = inst->src[0]; |
| |
| assert(inst->src[1].file == IMM); |
| enum brw_swap_direction dir = (enum brw_swap_direction)inst->src[1].ud; |
| |
| switch (dir) { |
| case BRW_SWAP_HORIZONTAL: { |
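/* Swap adjacent channels: tmp[2k] = value[2k+1] and
 * tmp[2k+1] = value[2k], using a pair of half-width strided MOVs.
 */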
| const brw_reg tmp = bld.vgrf(value.type); |
| |
| const brw_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0); |
| |
| const brw_reg src_left = horiz_stride(value, 2); |
| const brw_reg src_right = horiz_stride(horiz_offset(value, 1), 2); |
| const brw_reg tmp_left = horiz_stride(tmp, 2); |
| const brw_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2); |
| |
| ubld.MOV(tmp_left, src_right); |
| ubld.MOV(tmp_right, src_left); |
| |
| bld.MOV(retype(dst, value.type), tmp); |
| break; |
| } |
| case BRW_SWAP_VERTICAL: |
| case BRW_SWAP_DIAGONAL: { |
| if (brw_type_size_bits(value.type) == 32) { |
| /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ |
| const unsigned swizzle = dir == BRW_SWAP_VERTICAL ? BRW_SWIZZLE4(2,3,0,1) |
| : BRW_SWIZZLE4(3,2,1,0); |
| const brw_reg tmp = bld.vgrf(value.type); |
| const brw_builder ubld = bld.exec_all(); |
| ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, brw_imm_ud(swizzle)); |
| bld.MOV(dst, tmp); |
| } else { |
| /* For larger data types, we have to either emit dispatch_width many |
| * MOVs or else fall back to doing indirects. |
| */ |
| const unsigned xor_mask = dir == BRW_SWAP_VERTICAL ? 0x2 : 0x3; |
| brw_reg idx = bld.vgrf(BRW_TYPE_W); |
| bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(xor_mask)); |
| bld.emit(SHADER_OPCODE_SHUFFLE, dst, value, idx); |
| } |
| break; |
| } |
| } |
| |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_read_from_live_channel(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->sources == 1); |
| assert(inst->dst.type == inst->src[0].type); |
| brw_reg dst = inst->dst; |
| brw_reg value = inst->src[0]; |
| |
| bld.MOV(dst, bld.emit_uniformize(value)); |
| |
| inst->remove(); |
| return true; |
| } |
| |
| static bool |
| brw_lower_read_from_channel(brw_shader &s, brw_inst *inst) |
| { |
| const brw_builder bld(inst); |
| |
| assert(inst->sources == 2); |
| assert(inst->dst.type == inst->src[0].type); |
| |
| brw_reg dst = inst->dst; |
| brw_reg value = inst->src[0]; |
| brw_reg index = retype(inst->src[1], BRW_TYPE_UD); |
| |
/* When the subgroup_size picked by NIR is larger than the dispatch
 * width picked by the backend (this can happen in RT and FS), bound the
 * invocation index to the dispatch width.
 */
| const unsigned dispatch_width_mask = s.dispatch_width - 1; |
| |
| if (index.file == IMM) { |
/* Always apply the mask here since it is cheap. */
| bld.MOV(dst, component(value, index.ud & dispatch_width_mask)); |
| } else { |
| if (s.api_subgroup_size == 0 || s.dispatch_width < s.api_subgroup_size) |
| index = bld.AND(index, brw_imm_ud(dispatch_width_mask)); |
| |
| brw_reg tmp = bld.BROADCAST(value, bld.emit_uniformize(index)); |
| bld.MOV(dst, tmp); |
| } |
| |
| inst->remove(); |
| return true; |
| } |
| |
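/* Lower the subgroup-operation virtual opcodes (reductions, scans,
 * votes, ballots, quad swaps, and channel reads) into sequences of
 * ordinary ALU instructions.
 */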
| bool |
| brw_lower_subgroup_ops(brw_shader &s) |
| { |
| bool progress = false; |
| |
| foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) { |
| switch (inst->opcode) { |
| case SHADER_OPCODE_REDUCE: |
| progress |= brw_lower_reduce(s, inst); |
| break; |
| |
| case SHADER_OPCODE_INCLUSIVE_SCAN: |
| case SHADER_OPCODE_EXCLUSIVE_SCAN: |
| progress |= brw_lower_scan(s, inst); |
| break; |
| |
| case SHADER_OPCODE_VOTE_ANY: |
| case SHADER_OPCODE_VOTE_ALL: |
| case SHADER_OPCODE_VOTE_EQUAL: |
| progress |= brw_lower_vote(s, inst); |
| break; |
| |
| case SHADER_OPCODE_BALLOT: |
| progress |= brw_lower_ballot(s, inst); |
| break; |
| |
| case SHADER_OPCODE_QUAD_SWAP: |
| progress |= brw_lower_quad_swap(s, inst); |
| break; |
| |
| case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL: |
| progress |= brw_lower_read_from_live_channel(s, inst); |
| break; |
| |
| case SHADER_OPCODE_READ_FROM_CHANNEL: |
| progress |= brw_lower_read_from_channel(s, inst); |
| break; |
| |
| default: |
| /* Nothing to do. */ |
| break; |
| } |
| } |
| |
| if (progress) |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | |
| BRW_DEPENDENCY_VARIABLES); |
| |
| return progress; |
| } |