| /* |
| * Copyright © 2010, 2022 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| /** |
| * @file brw_lower_logical_sends.cpp |
| */ |
| |
| #include "brw_eu.h" |
| #include "brw_fs.h" |
| |
| using namespace brw; |
| |
| static void |
| lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const bool per_slot_present = |
| inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE; |
| |
| assert(inst->size_written % REG_SIZE == 0); |
| assert(inst->header_size == 0); |
| |
| fs_reg *payload_sources = new fs_reg[inst->mlen]; |
| fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen), |
| BRW_REGISTER_TYPE_F); |
| |
| unsigned header_size = 0; |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE]; |
| if (per_slot_present) |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS]; |
| |
| bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size); |
| |
| delete [] payload_sources; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->header_size = header_size; |
| |
| inst->sfid = BRW_SFID_URB; |
| inst->desc = brw_urb_desc(devinfo, |
| GFX8_URB_OPCODE_SIMD8_READ, |
| per_slot_present, |
| false, |
| inst->offset); |
| |
| inst->ex_desc = 0; |
| inst->ex_mlen = 0; |
| inst->send_is_volatile = true; |
| |
| inst->resize_sources(4); |
| |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = payload; |
| inst->src[3] = brw_null_reg(); |
| } |
| |
| static void |
| lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const bool per_slot_present = |
| inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE; |
| const bool channel_mask_present = |
| inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE; |
| |
| assert(inst->header_size == 0); |
| |
| fs_reg *payload_sources = new fs_reg[inst->mlen]; |
| fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen), |
| BRW_REGISTER_TYPE_F); |
| |
| unsigned header_size = 0; |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE]; |
| if (per_slot_present) |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS]; |
| |
| if (channel_mask_present) |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK]; |
| |
| for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++) |
| payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j); |
| |
| bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size); |
| |
| delete [] payload_sources; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->header_size = header_size; |
| inst->dst = brw_null_reg(); |
| |
| inst->sfid = BRW_SFID_URB; |
| inst->desc = brw_urb_desc(devinfo, |
| GFX8_URB_OPCODE_SIMD8_WRITE, |
| per_slot_present, |
| channel_mask_present, |
| inst->offset); |
| |
| inst->ex_desc = 0; |
| inst->ex_mlen = 0; |
| inst->send_has_side_effects = true; |
| |
| inst->resize_sources(4); |
| |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = payload; |
| inst->src[3] = brw_null_reg(); |
| } |
| |
| static void |
| setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, |
| fs_reg *dst, fs_reg color, unsigned components) |
| { |
| if (key->clamp_fragment_color) { |
| fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4); |
| assert(color.type == BRW_REGISTER_TYPE_F); |
| |
| for (unsigned i = 0; i < components; i++) |
| set_saturate(true, |
| bld.MOV(offset(tmp, bld, i), offset(color, bld, i))); |
| |
| color = tmp; |
| } |
| |
| for (unsigned i = 0; i < components; i++) |
| dst[i] = offset(color, bld, i); |
| } |
| |
| static void |
| lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, |
| const struct brw_wm_prog_data *prog_data, |
| const brw_wm_prog_key *key, |
| const fs_thread_payload &payload) |
| { |
| assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0]; |
| const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1]; |
| const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA]; |
| const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH]; |
| const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH]; |
| const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL]; |
| fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK]; |
| const unsigned components = |
| inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; |
| |
| assert(inst->target != 0 || src0_alpha.file == BAD_FILE); |
| |
| /* We can potentially have a message length of up to 15, so we have to set |
| * base_mrf to either 0 or 1 in order to fit in m0..m15. |
| */ |
| fs_reg sources[15]; |
| int header_size = 2, payload_header_size; |
| unsigned length = 0; |
| |
| if (devinfo->ver < 6) { |
| /* TODO: Support SIMD32 on gfx4-5 */ |
| assert(bld.group() < 16); |
| |
| /* For gfx4-5, we always have a header consisting of g0 and g1. We have |
| * an implied MOV from g0,g1 to the start of the message. The MOV from |
| * g0 is handled by the hardware and the MOV from g1 is provided by the |
| * generator. This is required because, on gfx4-5, the generator may |
| * generate two write messages with different message lengths in order |
| * to handle AA data properly. |
| * |
| * Also, since the pixel mask goes in the g0 portion of the message and |
| * since render target writes are the last thing in the shader, we write |
| * the pixel mask directly into g0 and it will get copied as part of the |
| * implied write. |
| */ |
| if (prog_data->uses_kill) { |
| bld.exec_all().group(1, 0) |
| .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), |
| brw_sample_mask_reg(bld)); |
| } |
| |
| assert(length == 0); |
| length = 2; |
| } else if ((devinfo->verx10 <= 70 && |
| prog_data->uses_kill) || |
| (devinfo->ver < 11 && |
| (color1.file != BAD_FILE || key->nr_color_regions > 1))) { |
| /* From the Sandy Bridge PRM, volume 4, page 198: |
| * |
| * "Dispatched Pixel Enables. One bit per pixel indicating |
| * which pixels were originally enabled when the thread was |
| * dispatched. This field is only required for the end-of- |
| * thread message and on all dual-source messages." |
| */ |
| const fs_builder ubld = bld.exec_all().group(8, 0); |
| |
| fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); |
| if (bld.group() < 16) { |
| /* The header starts off as g0 and g1 for the first half */ |
| ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0), |
| BRW_REGISTER_TYPE_UD)); |
| } else { |
| /* The header starts off as g0 and g2 for the second half */ |
| assert(bld.group() < 32); |
| const fs_reg header_sources[2] = { |
| retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD), |
| retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD), |
| }; |
| ubld.LOAD_PAYLOAD(header, header_sources, 2, 0); |
| |
| /* Gfx12 will require additional fix-ups if we ever hit this path. */ |
| assert(devinfo->ver < 12); |
| } |
| |
| uint32_t g00_bits = 0; |
| |
| /* Set "Source0 Alpha Present to RenderTarget" bit in message |
| * header. |
| */ |
| if (src0_alpha.file != BAD_FILE) |
| g00_bits |= 1 << 11; |
| |
| /* Set computes stencil to render target */ |
| if (prog_data->computed_stencil) |
| g00_bits |= 1 << 14; |
| |
| if (g00_bits) { |
| /* OR extra bits into g0.0 */ |
| ubld.group(1, 0).OR(component(header, 0), |
| retype(brw_vec1_grf(0, 0), |
| BRW_REGISTER_TYPE_UD), |
| brw_imm_ud(g00_bits)); |
| } |
| |
| /* Set the render target index for choosing BLEND_STATE. */ |
| if (inst->target > 0) { |
| ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target)); |
| } |
| |
| if (prog_data->uses_kill) { |
| ubld.group(1, 0).MOV(retype(component(header, 15), |
| BRW_REGISTER_TYPE_UW), |
| brw_sample_mask_reg(bld)); |
| } |
| |
| assert(length == 0); |
| sources[0] = header; |
| sources[1] = horiz_offset(header, 8); |
| length = 2; |
| } |
| assert(length == 0 || length == 2); |
| header_size = length; |
| |
| if (payload.aa_dest_stencil_reg[0]) { |
| assert(inst->group < 16); |
| sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1)); |
| bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha") |
| .MOV(sources[length], |
| fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0))); |
| length++; |
| } |
| |
| if (src0_alpha.file != BAD_FILE) { |
| for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) { |
| const fs_builder &ubld = bld.exec_all().group(8, i) |
| .annotate("FB write src0 alpha"); |
| const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F); |
| ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8)); |
| setup_color_payload(ubld, key, &sources[length], tmp, 1); |
| length++; |
| } |
| } |
| |
| if (sample_mask.file != BAD_FILE) { |
| sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1), |
| BRW_REGISTER_TYPE_UD); |
| |
| /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are |
| * relevant. Since it's unsigned single words one vgrf is always |
| * 16-wide, but only the lower or higher 8 channels will be used by the |
| * hardware when doing a SIMD8 write depending on whether we have |
| * selected the subspans for the first or second half respectively. |
| */ |
| assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4); |
| sample_mask.type = BRW_REGISTER_TYPE_UW; |
| sample_mask.stride *= 2; |
| |
| bld.exec_all().annotate("FB write oMask") |
| .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW), |
| inst->group % 16), |
| sample_mask); |
| length++; |
| } |
| |
| payload_header_size = length; |
| |
| setup_color_payload(bld, key, &sources[length], color0, components); |
| length += 4; |
| |
| if (color1.file != BAD_FILE) { |
| setup_color_payload(bld, key, &sources[length], color1, components); |
| length += 4; |
| } |
| |
| if (src_depth.file != BAD_FILE) { |
| sources[length] = src_depth; |
| length++; |
| } |
| |
| if (dst_depth.file != BAD_FILE) { |
| sources[length] = dst_depth; |
| length++; |
| } |
| |
| if (src_stencil.file != BAD_FILE) { |
| assert(devinfo->ver >= 9); |
| assert(bld.dispatch_width() == 8); |
| |
| /* XXX: src_stencil is only available on gfx9+. dst_depth is never |
| * available on gfx9+. As such it's impossible to have both enabled at the |
| * same time and therefore length cannot overrun the array. |
| */ |
| assert(length < 15); |
| |
| sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD); |
| bld.exec_all().annotate("FB write OS") |
| .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB), |
| subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0)); |
| length++; |
| } |
| |
| fs_inst *load; |
| if (devinfo->ver >= 7) { |
| /* Send from the GRF */ |
| fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F); |
| load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size); |
| payload.nr = bld.shader->alloc.allocate(regs_written(load)); |
| load->dst = payload; |
| |
| uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data); |
| |
| inst->desc = |
| (inst->group / 16) << 11 | /* rt slot group */ |
| brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt, |
| prog_data->per_coarse_pixel_dispatch); |
| |
| uint32_t ex_desc = 0; |
| if (devinfo->ver >= 11) { |
| /* Set the "Render Target Index" and "Src0 Alpha Present" fields |
| * in the extended message descriptor, in lieu of using a header. |
| */ |
| ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15; |
| |
| if (key->nr_color_regions == 0) |
| ex_desc |= 1 << 20; /* Null Render Target */ |
| } |
| inst->ex_desc = ex_desc; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->resize_sources(3); |
| inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE; |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); |
| inst->src[2] = payload; |
| inst->mlen = regs_written(load); |
| inst->ex_mlen = 0; |
| inst->header_size = header_size; |
| inst->check_tdr = true; |
| inst->send_has_side_effects = true; |
| } else { |
| /* Send from the MRF */ |
| load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F), |
| sources, length, payload_header_size); |
| |
| /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD |
| * will do this for us if we just give it a COMPR4 destination. |
| */ |
| if (devinfo->ver < 6 && bld.dispatch_width() == 16) |
| load->dst.nr |= BRW_MRF_COMPR4; |
| |
| if (devinfo->ver < 6) { |
| /* Set up src[0] for the implied MOV from grf0-1 */ |
| inst->resize_sources(1); |
| inst->src[0] = brw_vec8_grf(0, 0); |
| } else { |
| inst->resize_sources(0); |
| } |
| inst->base_mrf = 1; |
| inst->opcode = FS_OPCODE_FB_WRITE; |
| inst->mlen = regs_written(load); |
| inst->header_size = header_size; |
| } |
| } |
| |
| static void |
| lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const fs_builder &ubld = bld.exec_all().group(8, 0); |
| const unsigned length = 2; |
| const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length); |
| |
| if (bld.group() < 16) { |
| ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0), |
| BRW_REGISTER_TYPE_UD)); |
| } else { |
| assert(bld.group() < 32); |
| const fs_reg header_sources[] = { |
| retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD), |
| retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD) |
| }; |
| ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0); |
| |
| if (devinfo->ver >= 12) { |
| /* On Gfx12 the Viewport and Render Target Array Index fields (AKA |
| * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render |
| * target message header format was updated accordingly -- However |
| * the updated format only works for the lower 16 channels in a |
| * SIMD32 thread, since the higher 16 channels want the subspan data |
| * from r2 instead of r1, so we need to copy over the contents of |
| * r1.1 in order to fix things up. |
| */ |
| ubld.group(1, 0).MOV(component(header, 9), |
| retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD)); |
| } |
| } |
| |
| /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) : |
| * |
| * "Must be zero for Render Target Read message." |
| * |
| * For bits : |
| * - 14 : Stencil Present to Render Target |
| * - 13 : Source Depth Present to Render Target |
| * - 12 : oMask to Render Target |
| * - 11 : Source0 Alpha Present to Render Target |
| */ |
| ubld.group(1, 0).AND(component(header, 0), |
| component(header, 0), |
| brw_imm_ud(~INTEL_MASK(14, 11))); |
| |
| inst->resize_sources(1); |
| inst->src[0] = header; |
| inst->opcode = FS_OPCODE_FB_READ; |
| inst->mlen = length; |
| inst->header_size = length; |
| } |
| |
| static void |
| lower_sampler_logical_send_gfx4(const fs_builder &bld, fs_inst *inst, opcode op, |
| const fs_reg &coordinate, |
| const fs_reg &shadow_c, |
| const fs_reg &lod, const fs_reg &lod2, |
| const fs_reg &surface, |
| const fs_reg &sampler, |
| unsigned coord_components, |
| unsigned grad_components) |
| { |
| const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB || |
| op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS); |
| fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F); |
| fs_reg msg_end = msg_begin; |
| |
| /* g0 header. */ |
| msg_end = offset(msg_end, bld.group(8, 0), 1); |
| |
| for (unsigned i = 0; i < coord_components; i++) |
| bld.MOV(retype(offset(msg_end, bld, i), coordinate.type), |
| offset(coordinate, bld, i)); |
| |
| msg_end = offset(msg_end, bld, coord_components); |
| |
| /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8 |
| * require all three components to be present and zero if they are unused. |
| */ |
| if (coord_components > 0 && |
| (has_lod || shadow_c.file != BAD_FILE || |
| (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) { |
| assert(coord_components <= 3); |
| for (unsigned i = 0; i < 3 - coord_components; i++) |
| bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f)); |
| |
| msg_end = offset(msg_end, bld, 3 - coord_components); |
| } |
| |
| if (op == SHADER_OPCODE_TXD) { |
| /* TXD unsupported in SIMD16 mode. */ |
| assert(bld.dispatch_width() == 8); |
| |
| /* the slots for u and v are always present, but r is optional */ |
| if (coord_components < 2) |
| msg_end = offset(msg_end, bld, 2 - coord_components); |
| |
| /* P = u, v, r |
| * dPdx = dudx, dvdx, drdx |
| * dPdy = dudy, dvdy, drdy |
| * |
| * 1-arg: Does not exist. |
| * |
| * 2-arg: dudx dvdx dudy dvdy |
| * dPdx.x dPdx.y dPdy.x dPdy.y |
| * m4 m5 m6 m7 |
| * |
| * 3-arg: dudx dvdx drdx dudy dvdy drdy |
| * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z |
| * m5 m6 m7 m8 m9 m10 |
| */ |
| for (unsigned i = 0; i < grad_components; i++) |
| bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i)); |
| |
| msg_end = offset(msg_end, bld, MAX2(grad_components, 2)); |
| |
| for (unsigned i = 0; i < grad_components; i++) |
| bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i)); |
| |
| msg_end = offset(msg_end, bld, MAX2(grad_components, 2)); |
| } |
| |
| if (has_lod) { |
| /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without* |
| * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode. |
| */ |
| assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 : |
| bld.dispatch_width() == 16); |
| |
| const brw_reg_type type = |
| (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ? |
| BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F); |
| bld.MOV(retype(msg_end, type), lod); |
| msg_end = offset(msg_end, bld, 1); |
| } |
| |
| if (shadow_c.file != BAD_FILE) { |
| if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) { |
| /* There's no plain shadow compare message, so we use shadow |
| * compare with a bias of 0.0. |
| */ |
| bld.MOV(msg_end, brw_imm_f(0.0f)); |
| msg_end = offset(msg_end, bld, 1); |
| } |
| |
| bld.MOV(msg_end, shadow_c); |
| msg_end = offset(msg_end, bld, 1); |
| } |
| |
| inst->opcode = op; |
| inst->src[0] = reg_undef; |
| inst->src[1] = surface; |
| inst->src[2] = sampler; |
| inst->resize_sources(3); |
| inst->base_mrf = msg_begin.nr; |
| inst->mlen = msg_end.nr - msg_begin.nr; |
| inst->header_size = 1; |
| } |
| |
| static void |
| lower_sampler_logical_send_gfx5(const fs_builder &bld, fs_inst *inst, opcode op, |
| const fs_reg &coordinate, |
| const fs_reg &shadow_c, |
| const fs_reg &lod, const fs_reg &lod2, |
| const fs_reg &sample_index, |
| const fs_reg &surface, |
| const fs_reg &sampler, |
| unsigned coord_components, |
| unsigned grad_components) |
| { |
| fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F); |
| fs_reg msg_coords = message; |
| unsigned header_size = 0; |
| |
| if (inst->offset != 0) { |
| /* The offsets set up by the visitor are in the m1 header, so we can't |
| * go headerless. |
| */ |
| header_size = 1; |
| message.nr--; |
| } |
| |
| for (unsigned i = 0; i < coord_components; i++) |
| bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), |
| offset(coordinate, bld, i)); |
| |
| fs_reg msg_end = offset(msg_coords, bld, coord_components); |
| fs_reg msg_lod = offset(msg_coords, bld, 4); |
| |
| if (shadow_c.file != BAD_FILE) { |
| fs_reg msg_shadow = msg_lod; |
| bld.MOV(msg_shadow, shadow_c); |
| msg_lod = offset(msg_shadow, bld, 1); |
| msg_end = msg_lod; |
| } |
| |
| switch (op) { |
| case SHADER_OPCODE_TXL: |
| case FS_OPCODE_TXB: |
| bld.MOV(msg_lod, lod); |
| msg_end = offset(msg_lod, bld, 1); |
| break; |
| case SHADER_OPCODE_TXD: |
| /** |
| * P = u, v, r |
| * dPdx = dudx, dvdx, drdx |
| * dPdy = dudy, dvdy, drdy |
| * |
| * Load up these values: |
| * - dudx dudy dvdx dvdy drdx drdy |
| * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z |
| */ |
| msg_end = msg_lod; |
| for (unsigned i = 0; i < grad_components; i++) { |
| bld.MOV(msg_end, offset(lod, bld, i)); |
| msg_end = offset(msg_end, bld, 1); |
| |
| bld.MOV(msg_end, offset(lod2, bld, i)); |
| msg_end = offset(msg_end, bld, 1); |
| } |
| break; |
| case SHADER_OPCODE_TXS: |
| msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD); |
| bld.MOV(msg_lod, lod); |
| msg_end = offset(msg_lod, bld, 1); |
| break; |
| case SHADER_OPCODE_TXF: |
| msg_lod = offset(msg_coords, bld, 3); |
| bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod); |
| msg_end = offset(msg_lod, bld, 1); |
| break; |
| case SHADER_OPCODE_TXF_CMS: |
| msg_lod = offset(msg_coords, bld, 3); |
| /* lod */ |
| bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)); |
| /* sample index */ |
| bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index); |
| msg_end = offset(msg_lod, bld, 2); |
| break; |
| default: |
| break; |
| } |
| |
| inst->opcode = op; |
| inst->src[0] = reg_undef; |
| inst->src[1] = surface; |
| inst->src[2] = sampler; |
| inst->resize_sources(3); |
| inst->base_mrf = message.nr; |
| inst->mlen = msg_end.nr - message.nr; |
| inst->header_size = header_size; |
| |
| /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */ |
| assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE); |
| } |
| |
| static bool |
| is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler) |
| { |
| if (devinfo->verx10 <= 70) |
| return false; |
| |
| return sampler.file != IMM || sampler.ud >= 16; |
| } |
| |
| static unsigned |
| sampler_msg_type(const intel_device_info *devinfo, |
| opcode opcode, bool shadow_compare) |
| { |
| assert(devinfo->ver >= 5); |
| switch (opcode) { |
| case SHADER_OPCODE_TEX: |
| return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE : |
| GFX5_SAMPLER_MESSAGE_SAMPLE; |
| case FS_OPCODE_TXB: |
| return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE : |
| GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS; |
| case SHADER_OPCODE_TXL: |
| return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE : |
| GFX5_SAMPLER_MESSAGE_SAMPLE_LOD; |
| case SHADER_OPCODE_TXL_LZ: |
| return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ : |
| GFX9_SAMPLER_MESSAGE_SAMPLE_LZ; |
| case SHADER_OPCODE_TXS: |
| case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: |
| return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO; |
| case SHADER_OPCODE_TXD: |
| assert(!shadow_compare || devinfo->verx10 >= 75); |
| return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE : |
| GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS; |
| case SHADER_OPCODE_TXF: |
| return GFX5_SAMPLER_MESSAGE_SAMPLE_LD; |
| case SHADER_OPCODE_TXF_LZ: |
| assert(devinfo->ver >= 9); |
| return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ; |
| case SHADER_OPCODE_TXF_CMS_W: |
| assert(devinfo->ver >= 9); |
| return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; |
| case SHADER_OPCODE_TXF_CMS: |
| return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS : |
| GFX5_SAMPLER_MESSAGE_SAMPLE_LD; |
| case SHADER_OPCODE_TXF_UMS: |
| assert(devinfo->ver >= 7); |
| return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS; |
| case SHADER_OPCODE_TXF_MCS: |
| assert(devinfo->ver >= 7); |
| return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; |
| case SHADER_OPCODE_LOD: |
| return GFX5_SAMPLER_MESSAGE_LOD; |
| case SHADER_OPCODE_TG4: |
| assert(devinfo->ver >= 7); |
| return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C : |
| GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4; |
| break; |
| case SHADER_OPCODE_TG4_OFFSET: |
| assert(devinfo->ver >= 7); |
| return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C : |
| GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; |
| case SHADER_OPCODE_SAMPLEINFO: |
| return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; |
| default: |
| unreachable("not reached"); |
| } |
| } |
| |
| /** |
| * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to |
| * the given requested_alignment_sz. |
| */ |
| static fs_inst * |
| emit_load_payload_with_padding(const fs_builder &bld, const fs_reg &dst, |
| const fs_reg *src, unsigned sources, |
| unsigned header_size, |
| unsigned requested_alignment_sz) |
| { |
| unsigned length = 0; |
| unsigned num_srcs = |
| sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width()); |
| fs_reg *src_comps = new fs_reg[num_srcs]; |
| |
| for (unsigned i = 0; i < header_size; i++) |
| src_comps[length++] = src[i]; |
| |
| for (unsigned i = header_size; i < sources; i++) { |
| unsigned src_sz = |
| retype(dst, src[i].type).component_size(bld.dispatch_width()); |
| const enum brw_reg_type padding_payload_type = |
| brw_reg_type_from_bit_size(type_sz(src[i].type) * 8, |
| BRW_REGISTER_TYPE_UD); |
| |
| src_comps[length++] = src[i]; |
| |
| /* Expand the real sources if component of requested payload type is |
| * larger than real source component. |
| */ |
| if (src_sz < requested_alignment_sz) { |
| for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) { |
| src_comps[length++] = retype(fs_reg(), padding_payload_type); |
| } |
| } |
| } |
| |
| fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size); |
| delete[] src_comps; |
| |
| return inst; |
| } |
| |
| static void |
| lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, |
| const fs_reg &coordinate, |
| const fs_reg &shadow_c, |
| fs_reg lod, const fs_reg &lod2, |
| const fs_reg &min_lod, |
| const fs_reg &sample_index, |
| const fs_reg &mcs, |
| const fs_reg &surface, |
| const fs_reg &sampler, |
| const fs_reg &surface_handle, |
| const fs_reg &sampler_handle, |
| const fs_reg &tg4_offset, |
| unsigned payload_type_bit_size, |
| unsigned coord_components, |
| unsigned grad_components) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const enum brw_reg_type payload_type = |
| brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F); |
| const enum brw_reg_type payload_unsigned_type = |
| brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD); |
| const enum brw_reg_type payload_signed_type = |
| brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D); |
| unsigned reg_width = bld.dispatch_width() / 8; |
| unsigned header_size = 0, length = 0; |
| fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE]; |
| for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) |
| sources[i] = bld.vgrf(payload_type); |
| |
| /* We must have exactly one of surface/sampler and surface/sampler_handle */ |
| assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); |
| assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE)); |
| |
| if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET || |
| inst->offset != 0 || inst->eot || |
| op == SHADER_OPCODE_SAMPLEINFO || |
| sampler_handle.file != BAD_FILE || |
| is_high_sampler(devinfo, sampler)) { |
| /* For general texture offsets (no txf workaround), we need a header to |
| * put them in. |
| * |
| * TG4 needs to place its channel select in the header, for interaction |
| * with ARB_texture_swizzle. The sampler index is only 4-bits, so for |
| * larger sampler numbers we need to offset the Sampler State Pointer in |
| * the header. |
| */ |
| fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD); |
| header_size = 1; |
| length++; |
| |
| /* If we're requesting fewer than four channels worth of response, |
| * and we have an explicit header, we need to set up the sampler |
| * writemask. It's reversed from normal: 1 means "don't write". |
| */ |
| if (!inst->eot && regs_written(inst) != 4 * reg_width) { |
| assert(regs_written(inst) % reg_width == 0); |
| unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf; |
| inst->offset |= mask << 12; |
| } |
| |
| /* Build the actual header */ |
| const fs_builder ubld = bld.exec_all().group(8, 0); |
| const fs_builder ubld1 = ubld.group(1, 0); |
| ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); |
| if (inst->offset) { |
| ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset)); |
| } else if (bld.shader->stage != MESA_SHADER_VERTEX && |
| bld.shader->stage != MESA_SHADER_FRAGMENT) { |
| /* The vertex and fragment stages have g0.2 set to 0, so |
| * header0.2 is 0 when g0 is copied. Other stages may not, so we |
| * must set it to 0 to avoid setting undesirable bits in the |
| * message. |
| */ |
| ubld1.MOV(component(header, 2), brw_imm_ud(0)); |
| } |
| |
| if (sampler_handle.file != BAD_FILE) { |
| /* Bindless sampler handles aren't relative to the sampler state |
| * pointer passed into the shader through SAMPLER_STATE_POINTERS_*. |
| * Instead, it's an absolute pointer relative to dynamic state base |
| * address. |
| * |
| * Sampler states are 16 bytes each and the pointer we give here has |
| * to be 32-byte aligned. In order to avoid more indirect messages |
| * than required, we assume that all bindless sampler states are |
| * 32-byte aligned. This sacrifices a bit of general state base |
| * address space but means we can do something more efficient in the |
| * shader. |
| */ |
| ubld1.MOV(component(header, 3), sampler_handle); |
| } else if (is_high_sampler(devinfo, sampler)) { |
| fs_reg sampler_state_ptr = |
| retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD); |
| |
| /* Gfx11+ sampler message headers include bits in 4:0 which conflict |
| * with the ones included in g0.3 bits 4:0. Mask them out. |
| */ |
| if (devinfo->ver >= 11) { |
| sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD); |
| ubld1.AND(sampler_state_ptr, |
| retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD), |
| brw_imm_ud(INTEL_MASK(31, 5))); |
| } |
| |
| if (sampler.file == BRW_IMMEDIATE_VALUE) { |
| assert(sampler.ud >= 16); |
| const int sampler_state_size = 16; /* 16 bytes */ |
| |
| ubld1.ADD(component(header, 3), sampler_state_ptr, |
| brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size)); |
| } else { |
| fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD); |
| ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0)); |
| ubld1.SHL(tmp, tmp, brw_imm_ud(4)); |
| ubld1.ADD(component(header, 3), sampler_state_ptr, tmp); |
| } |
| } else if (devinfo->ver >= 11) { |
| /* Gfx11+ sampler message headers include bits in 4:0 which conflict |
| * with the ones included in g0.3 bits 4:0. Mask them out. |
| */ |
| ubld1.AND(component(header, 3), |
| retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD), |
| brw_imm_ud(INTEL_MASK(31, 5))); |
| } |
| } |
| |
| if (shadow_c.file != BAD_FILE) { |
| bld.MOV(sources[length], shadow_c); |
| length++; |
| } |
| |
| bool coordinate_done = false; |
| |
| /* Set up the LOD info */ |
| switch (op) { |
| case FS_OPCODE_TXB: |
| case SHADER_OPCODE_TXL: |
| if (devinfo->ver >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) { |
| op = SHADER_OPCODE_TXL_LZ; |
| break; |
| } |
| bld.MOV(sources[length], lod); |
| length++; |
| break; |
| case SHADER_OPCODE_TXD: |
| /* TXD should have been lowered in SIMD16 mode. */ |
| assert(bld.dispatch_width() == 8); |
| |
| /* Load dPdx and the coordinate together: |
| * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z |
| */ |
| for (unsigned i = 0; i < coord_components; i++) { |
| bld.MOV(sources[length++], offset(coordinate, bld, i)); |
| |
| /* For cube map array, the coordinate is (u,v,r,ai) but there are |
| * only derivatives for (u, v, r). |
| */ |
| if (i < grad_components) { |
| bld.MOV(sources[length++], offset(lod, bld, i)); |
| bld.MOV(sources[length++], offset(lod2, bld, i)); |
| } |
| } |
| |
| coordinate_done = true; |
| break; |
| case SHADER_OPCODE_TXS: |
| bld.MOV(retype(sources[length], payload_unsigned_type), lod); |
| length++; |
| break; |
| case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: |
| /* We need an LOD; just use 0 */ |
| bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0)); |
| length++; |
| break; |
| case SHADER_OPCODE_TXF: |
| /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. |
| * On Gfx9 they are u, v, lod, r |
| */ |
| bld.MOV(retype(sources[length++], payload_signed_type), coordinate); |
| |
| if (devinfo->ver >= 9) { |
| if (coord_components >= 2) { |
| bld.MOV(retype(sources[length], payload_signed_type), |
| offset(coordinate, bld, 1)); |
| } else { |
| sources[length] = brw_imm_d(0); |
| } |
| length++; |
| } |
| |
| if (devinfo->ver >= 9 && lod.is_zero()) { |
| op = SHADER_OPCODE_TXF_LZ; |
| } else { |
| bld.MOV(retype(sources[length], payload_signed_type), lod); |
| length++; |
| } |
| |
| for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++) |
| bld.MOV(retype(sources[length++], payload_signed_type), |
| offset(coordinate, bld, i)); |
| |
| coordinate_done = true; |
| break; |
| |
| case SHADER_OPCODE_TXF_CMS: |
| case SHADER_OPCODE_TXF_CMS_W: |
| case SHADER_OPCODE_TXF_UMS: |
| case SHADER_OPCODE_TXF_MCS: |
| if (op == SHADER_OPCODE_TXF_UMS || |
| op == SHADER_OPCODE_TXF_CMS || |
| op == SHADER_OPCODE_TXF_CMS_W) { |
| bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index); |
| } |
| |
| /* Data from the multisample control surface. */ |
| if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) { |
| unsigned num_mcs_components = 1; |
| |
| /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs - |
| * Shared Functions - 3D Sampler - Messages - Message Format: |
| * |
| * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r |
| */ |
| if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W) |
| num_mcs_components = 4; |
| else if (op == SHADER_OPCODE_TXF_CMS_W) |
| num_mcs_components = 2; |
| |
| for (unsigned i = 0; i < num_mcs_components; ++i) { |
| bld.MOV(retype(sources[length++], payload_unsigned_type), |
| mcs.file == IMM ? mcs : offset(mcs, bld, i)); |
| } |
| } |
| |
| /* There is no offsetting for this message; just copy in the integer |
| * texture coordinates. |
| */ |
| for (unsigned i = 0; i < coord_components; i++) |
| bld.MOV(retype(sources[length++], payload_signed_type), |
| offset(coordinate, bld, i)); |
| |
| coordinate_done = true; |
| break; |
| case SHADER_OPCODE_TG4_OFFSET: |
| /* More crazy intermixing */ |
| for (unsigned i = 0; i < 2; i++) /* u, v */ |
| bld.MOV(sources[length++], offset(coordinate, bld, i)); |
| |
| for (unsigned i = 0; i < 2; i++) /* offu, offv */ |
| bld.MOV(retype(sources[length++], payload_signed_type), |
| offset(tg4_offset, bld, i)); |
| |
| if (coord_components == 3) /* r if present */ |
| bld.MOV(sources[length++], offset(coordinate, bld, 2)); |
| |
| coordinate_done = true; |
| break; |
| default: |
| break; |
| } |
| |
| /* Set up the coordinate (except for cases where it was done above) */ |
| if (!coordinate_done) { |
| for (unsigned i = 0; i < coord_components; i++) |
| bld.MOV(retype(sources[length++], payload_type), |
| offset(coordinate, bld, i)); |
| } |
| |
| if (min_lod.file != BAD_FILE) { |
| /* Account for all of the missing coordinate sources */ |
| if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) { |
| /* On DG2 and newer platforms, sample_d can only be used with 1D and |
| * 2D surfaces, so the maximum number of gradient components is 2. |
| * In spite of this limitation, the Bspec lists a mysterious R |
| * component before the min_lod, so the maximum coordinate components |
| * is 3. |
| * |
| * Wa_1209978020 |
| */ |
| length += 3 - coord_components; |
| length += (2 - grad_components) * 2; |
| } else { |
| length += 4 - coord_components; |
| if (op == SHADER_OPCODE_TXD) |
| length += (3 - grad_components) * 2; |
| } |
| |
| bld.MOV(sources[length++], min_lod); |
| |
| /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */ |
| if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB && |
| !inst->shadow_compare) |
| bld.MOV(sources[length++], min_lod); |
| } |
| |
| const fs_reg src_payload = |
| fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width), |
| BRW_REGISTER_TYPE_F); |
| /* In case of 16-bit payload each component takes one full register in |
| * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16 |
| * elements. In SIMD8H case hardware simply expects the components to be |
| * padded (i.e., aligned on reg boundary). |
| */ |
| fs_inst *load_payload_inst = |
| emit_load_payload_with_padding(bld, src_payload, sources, length, |
| header_size, REG_SIZE); |
| unsigned mlen = load_payload_inst->size_written / REG_SIZE; |
| unsigned simd_mode = 0; |
| if (payload_type_bit_size == 16) { |
| assert(devinfo->ver >= 11); |
| simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H : |
| GFX10_SAMPLER_SIMD_MODE_SIMD16H; |
| } else { |
| simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : |
| BRW_SAMPLER_SIMD_MODE_SIMD16; |
| } |
| |
| /* Generate the SEND. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = mlen; |
| inst->header_size = header_size; |
| |
| const unsigned msg_type = |
| sampler_msg_type(devinfo, op, inst->shadow_compare); |
| |
| inst->sfid = BRW_SFID_SAMPLER; |
| if (surface.file == IMM && |
| (sampler.file == IMM || sampler_handle.file != BAD_FILE)) { |
| inst->desc = brw_sampler_desc(devinfo, surface.ud, |
| sampler.file == IMM ? sampler.ud % 16 : 0, |
| msg_type, |
| simd_mode, |
| 0 /* return_format unused on gfx7+ */); |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); |
| } else if (surface_handle.file != BAD_FILE) { |
| /* Bindless surface */ |
| assert(devinfo->ver >= 9); |
| inst->desc = brw_sampler_desc(devinfo, |
| GFX9_BTI_BINDLESS, |
| sampler.file == IMM ? sampler.ud % 16 : 0, |
| msg_type, |
| simd_mode, |
| 0 /* return_format unused on gfx7+ */); |
| |
| /* For bindless samplers, the entire address is included in the message |
| * header so we can leave the portion in the message descriptor 0. |
| */ |
| if (sampler_handle.file != BAD_FILE || sampler.file == IMM) { |
| inst->src[0] = brw_imm_ud(0); |
| } else { |
| const fs_builder ubld = bld.group(1, 0).exec_all(); |
| fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| ubld.SHL(desc, sampler, brw_imm_ud(8)); |
| inst->src[0] = desc; |
| } |
| |
| /* We assume that the driver provided the handle in the top 20 bits so |
| * we can use the surface handle directly as the extended descriptor. |
| */ |
| inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD); |
| } else { |
| /* Immediate portion of the descriptor */ |
| inst->desc = brw_sampler_desc(devinfo, |
| 0, /* surface */ |
| 0, /* sampler */ |
| msg_type, |
| simd_mode, |
| 0 /* return_format unused on gfx7+ */); |
| const fs_builder ubld = bld.group(1, 0).exec_all(); |
| fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| if (surface.equals(sampler)) { |
| /* This case is common in GL */ |
| ubld.MUL(desc, surface, brw_imm_ud(0x101)); |
| } else { |
| if (sampler_handle.file != BAD_FILE) { |
| ubld.MOV(desc, surface); |
| } else if (sampler.file == IMM) { |
| ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8)); |
| } else { |
| ubld.SHL(desc, sampler, brw_imm_ud(8)); |
| ubld.OR(desc, desc, surface); |
| } |
| } |
| ubld.AND(desc, desc, brw_imm_ud(0xfff)); |
| |
| inst->src[0] = component(desc, 0); |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| } |
| |
| inst->ex_desc = 0; |
| |
| inst->src[2] = src_payload; |
| inst->resize_sources(3); |
| |
| if (inst->eot) { |
| /* EOT sampler messages don't make sense to split because it would |
| * involve ending half of the thread early. |
| */ |
| assert(inst->group == 0); |
| /* We need to use SENDC for EOT sampler messages */ |
| inst->check_tdr = true; |
| inst->send_has_side_effects = true; |
| } |
| |
| /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */ |
| assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE); |
| } |
| |
| static unsigned |
| get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, |
| opcode op, const fs_reg *src) |
| { |
| unsigned src_type_size = 0; |
| |
| /* All sources need to have the same size, therefore seek the first valid |
| * and take the size from there. |
| */ |
| for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) { |
| if (src[i].file != BAD_FILE) { |
| src_type_size = brw_reg_type_to_size(src[i].type); |
| break; |
| } |
| } |
| |
| assert(src_type_size == 2 || src_type_size == 4); |
| |
| #ifndef NDEBUG |
| /* Make sure all sources agree. On gfx12 this doesn't hold when sampling |
| * compressed multisampled surfaces. There the payload contains MCS data |
| * which is already in 16-bits unlike the other parameters that need forced |
| * conversion. |
| */ |
| if (devinfo->verx10 < 125 || |
| (op != SHADER_OPCODE_TXF_CMS_W && |
| op != SHADER_OPCODE_TXF_CMS)) { |
| for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) { |
| assert(src[i].file == BAD_FILE || |
| brw_reg_type_to_size(src[i].type) == src_type_size); |
| } |
| } |
| #endif |
| |
| if (devinfo->verx10 < 125) |
| return src_type_size * 8; |
| |
| /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec: |
| * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message |
| * Format [GFX12:HAS:1209977870] * |
| * |
| * ld2dms_w SIMD8H and SIMD16H Only |
| * ld_mcs SIMD8H and SIMD16H Only |
| * ld2dms REMOVEDBY(GEN:HAS:1406788836) |
| */ |
| |
| if (op == SHADER_OPCODE_TXF_CMS_W || |
| op == SHADER_OPCODE_TXF_CMS || |
| op == SHADER_OPCODE_TXF_UMS || |
| op == SHADER_OPCODE_TXF_MCS) |
| src_type_size = 2; |
| |
| return src_type_size * 8; |
| } |
| |
| static void |
| lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE]; |
| const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C]; |
| const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD]; |
| const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2]; |
| const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD]; |
| const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX]; |
| const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS]; |
| const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE]; |
| const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER]; |
| const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE]; |
| const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE]; |
| const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET]; |
| assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM); |
| const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; |
| assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); |
| const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; |
| |
| if (devinfo->ver >= 7) { |
| const unsigned msg_payload_type_bit_size = |
| get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src); |
| |
| /* 16-bit payloads are available only on gfx11+ */ |
| assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11); |
| |
| lower_sampler_logical_send_gfx7(bld, inst, op, coordinate, |
| shadow_c, lod, lod2, min_lod, |
| sample_index, |
| mcs, surface, sampler, |
| surface_handle, sampler_handle, |
| tg4_offset, |
| msg_payload_type_bit_size, |
| coord_components, grad_components); |
| } else if (devinfo->ver >= 5) { |
| lower_sampler_logical_send_gfx5(bld, inst, op, coordinate, |
| shadow_c, lod, lod2, sample_index, |
| surface, sampler, |
| coord_components, grad_components); |
| } else { |
| lower_sampler_logical_send_gfx4(bld, inst, op, coordinate, |
| shadow_c, lod, lod2, |
| surface, sampler, |
| coord_components, grad_components); |
| } |
| } |
| |
| /** |
| * Predicate the specified instruction on the vector mask. |
| */ |
| static void |
| emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst) |
| { |
| assert(bld.shader->stage == MESA_SHADER_FRAGMENT && |
| bld.group() == inst->group && |
| bld.dispatch_width() == inst->exec_size); |
| |
| const fs_builder ubld = bld.exec_all().group(1, 0); |
| |
| const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader); |
| const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW); |
| ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3)); |
| const unsigned subreg = sample_mask_flag_subreg(v); |
| |
| ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask); |
| |
| if (inst->predicate) { |
| assert(inst->predicate == BRW_PREDICATE_NORMAL); |
| assert(!inst->predicate_inverse); |
| assert(inst->flag_subreg == 0); |
| /* Combine the vector mask with the existing predicate by using a |
| * vertical predication mode. |
| */ |
| inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; |
| } else { |
| inst->flag_subreg = subreg; |
| inst->predicate = BRW_PREDICATE_NORMAL; |
| inst->predicate_inverse = false; |
| } |
| } |
| |
| static void |
| setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc, |
| const fs_reg &surface, const fs_reg &surface_handle) |
| { |
| const ASSERTED intel_device_info *devinfo = bld.shader->devinfo; |
| |
| /* We must have exactly one of surface and surface_handle */ |
| assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); |
| |
| if (surface.file == IMM) { |
| inst->desc = desc | (surface.ud & 0xff); |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| } else if (surface_handle.file != BAD_FILE) { |
| /* Bindless surface */ |
| assert(devinfo->ver >= 9); |
| inst->desc = desc | GFX9_BTI_BINDLESS; |
| inst->src[0] = brw_imm_ud(0); |
| |
| /* We assume that the driver provided the handle in the top 20 bits so |
| * we can use the surface handle directly as the extended descriptor. |
| */ |
| inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD); |
| } else { |
| inst->desc = desc; |
| const fs_builder ubld = bld.exec_all().group(1, 0); |
| fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| ubld.AND(tmp, surface, brw_imm_ud(0xff)); |
| inst->src[0] = component(tmp, 0); |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| } |
| } |
| |
| static void |
| lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| |
| /* Get the logical send arguments. */ |
| const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS]; |
| const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA]; |
| const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE]; |
| const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE]; |
| const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS]; |
| const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG]; |
| const fs_reg &allow_sample_mask = |
| inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK]; |
| assert(arg.file == IMM); |
| assert(allow_sample_mask.file == IMM); |
| |
| /* Calculate the total number of components of the payload. */ |
| const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS); |
| const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA); |
| |
| const bool is_typed_access = |
| inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL || |
| inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL || |
| inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL; |
| |
| const bool is_surface_access = is_typed_access || |
| inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL || |
| inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL || |
| inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL; |
| |
| const bool is_stateless = |
| surface.file == IMM && (surface.ud == BRW_BTI_STATELESS || |
| surface.ud == GFX8_BTI_STATELESS_NON_COHERENT); |
| |
| const bool has_side_effects = inst->has_side_effects(); |
| |
| fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) : |
| fs_reg(brw_imm_d(0xffff)); |
| |
| /* From the BDW PRM Volume 7, page 147: |
| * |
| * "For the Data Cache Data Port*, the header must be present for the |
| * following message types: [...] Typed read/write/atomics" |
| * |
| * Earlier generations have a similar wording. Because of this restriction |
| * we don't attempt to implement sample masks via predication for such |
| * messages prior to Gfx9, since we have to provide a header anyway. On |
| * Gfx11+ the header has been removed so we can only use predication. |
| * |
| * For all stateless A32 messages, we also need a header |
| */ |
| fs_reg header; |
| if ((devinfo->ver < 9 && is_typed_access) || is_stateless) { |
| fs_builder ubld = bld.exec_all().group(8, 0); |
| header = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| if (is_stateless) { |
| assert(!is_surface_access); |
| ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header); |
| } else { |
| ubld.MOV(header, brw_imm_d(0)); |
| if (is_surface_access) |
| ubld.group(1, 0).MOV(component(header, 7), sample_mask); |
| } |
| } |
| const unsigned header_sz = header.file != BAD_FILE ? 1 : 0; |
| |
| fs_reg payload, payload2; |
| unsigned mlen, ex_mlen = 0; |
| if (devinfo->ver >= 9 && |
| (src.file == BAD_FILE || header.file == BAD_FILE)) { |
| /* We have split sends on gfx9 and above */ |
| if (header.file == BAD_FILE) { |
| payload = bld.move_to_vgrf(addr, addr_sz); |
| payload2 = bld.move_to_vgrf(src, src_sz); |
| mlen = addr_sz * (inst->exec_size / 8); |
| ex_mlen = src_sz * (inst->exec_size / 8); |
| } else { |
| assert(src.file == BAD_FILE); |
| payload = header; |
| payload2 = bld.move_to_vgrf(addr, addr_sz); |
| mlen = header_sz; |
| ex_mlen = addr_sz * (inst->exec_size / 8); |
| } |
| } else { |
| /* Allocate space for the payload. */ |
| const unsigned sz = header_sz + addr_sz + src_sz; |
| payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz); |
| fs_reg *const components = new fs_reg[sz]; |
| unsigned n = 0; |
| |
| /* Construct the payload. */ |
| if (header.file != BAD_FILE) |
| components[n++] = header; |
| |
| for (unsigned i = 0; i < addr_sz; i++) |
| components[n++] = offset(addr, bld, i); |
| |
| for (unsigned i = 0; i < src_sz; i++) |
| components[n++] = offset(src, bld, i); |
| |
| bld.LOAD_PAYLOAD(payload, components, sz, header_sz); |
| mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8; |
| |
| delete[] components; |
| } |
| |
| /* Predicate the instruction on the sample mask if no header is |
| * provided. |
| */ |
| if ((header.file == BAD_FILE || !is_surface_access) && |
| sample_mask.file != BAD_FILE && sample_mask.file != IMM) |
| brw_emit_predicate_on_sample_mask(bld, inst); |
| |
| uint32_t sfid; |
| switch (inst->opcode) { |
| case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: |
| case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: |
| /* Byte scattered opcodes go through the normal data cache */ |
| sfid = GFX7_SFID_DATAPORT_DATA_CACHE; |
| break; |
| |
| case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: |
| case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: |
| sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE : |
| devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE : |
| BRW_DATAPORT_READ_TARGET_RENDER_CACHE; |
| break; |
| |
| case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: |
| case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: |
| case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: |
| case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: |
| /* Untyped Surface messages go through the data cache but the SFID value |
| * changed on Haswell. |
| */ |
| sfid = (devinfo->verx10 >= 75 ? |
| HSW_SFID_DATAPORT_DATA_CACHE_1 : |
| GFX7_SFID_DATAPORT_DATA_CACHE); |
| break; |
| |
| case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: |
| case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: |
| case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: |
| /* Typed surface messages go through the render cache on IVB and the |
| * data cache on HSW+. |
| */ |
| sfid = (devinfo->verx10 >= 75 ? |
| HSW_SFID_DATAPORT_DATA_CACHE_1 : |
| GFX6_SFID_DATAPORT_RENDER_CACHE); |
| break; |
| |
| default: |
| unreachable("Unsupported surface opcode"); |
| } |
| |
| uint32_t desc; |
| switch (inst->opcode) { |
| case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: |
| desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, |
| arg.ud, /* num_channels */ |
| false /* write */); |
| break; |
| |
| case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: |
| desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, |
| arg.ud, /* num_channels */ |
| true /* write */); |
| break; |
| |
| case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: |
| desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, |
| arg.ud, /* bit_size */ |
| false /* write */); |
| break; |
| |
| case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: |
| desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, |
| arg.ud, /* bit_size */ |
| true /* write */); |
| break; |
| |
| case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: |
| assert(arg.ud == 32); /* bit_size */ |
| desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size, |
| false /* write */); |
| break; |
| |
| case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: |
| assert(arg.ud == 32); /* bit_size */ |
| desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size, |
| true /* write */); |
| break; |
| |
| case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: |
| desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size, |
| arg.ud, /* atomic_op */ |
| !inst->dst.is_null()); |
| break; |
| |
| case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: |
| desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size, |
| arg.ud, /* atomic_op */ |
| !inst->dst.is_null()); |
| break; |
| |
| case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: |
| desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group, |
| arg.ud, /* num_channels */ |
| false /* write */); |
| break; |
| |
| case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: |
| desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group, |
| arg.ud, /* num_channels */ |
| true /* write */); |
| break; |
| |
| case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: |
| desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group, |
| arg.ud, /* atomic_op */ |
| !inst->dst.is_null()); |
| break; |
| |
| default: |
| unreachable("Unknown surface logical instruction"); |
| } |
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = mlen; |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = header_sz; |
| inst->send_has_side_effects = has_side_effects; |
| inst->send_is_volatile = !has_side_effects; |
| |
| /* Set up SFID and descriptors */ |
| inst->sfid = sfid; |
| setup_surface_descriptors(bld, inst, desc, surface, surface_handle); |
| |
| inst->resize_sources(4); |
| |
| /* Finally, the payload */ |
| inst->src[2] = payload; |
| inst->src[3] = payload2; |
| } |
| |
| static enum lsc_opcode |
| brw_atomic_op_to_lsc_atomic_op(unsigned op) |
| { |
| switch(op) { |
| case BRW_AOP_AND: |
| return LSC_OP_ATOMIC_AND; |
| case BRW_AOP_OR: |
| return LSC_OP_ATOMIC_OR; |
| case BRW_AOP_XOR: |
| return LSC_OP_ATOMIC_XOR; |
| case BRW_AOP_MOV: |
| return LSC_OP_ATOMIC_STORE; |
| case BRW_AOP_INC: |
| return LSC_OP_ATOMIC_INC; |
| case BRW_AOP_DEC: |
| return LSC_OP_ATOMIC_DEC; |
| case BRW_AOP_ADD: |
| return LSC_OP_ATOMIC_ADD; |
| case BRW_AOP_SUB: |
| return LSC_OP_ATOMIC_SUB; |
| case BRW_AOP_IMAX: |
| return LSC_OP_ATOMIC_MAX; |
| case BRW_AOP_IMIN: |
| return LSC_OP_ATOMIC_MIN; |
| case BRW_AOP_UMAX: |
| return LSC_OP_ATOMIC_UMAX; |
| case BRW_AOP_UMIN: |
| return LSC_OP_ATOMIC_UMIN; |
| case BRW_AOP_CMPWR: |
| return LSC_OP_ATOMIC_CMPXCHG; |
| default: |
| assert(false); |
| unreachable("invalid atomic opcode"); |
| } |
| } |
| |
| static enum lsc_opcode |
| brw_atomic_op_to_lsc_fatomic_op(uint32_t aop) |
| { |
| switch(aop) { |
| case BRW_AOP_FMAX: |
| return LSC_OP_ATOMIC_FMAX; |
| case BRW_AOP_FMIN: |
| return LSC_OP_ATOMIC_FMIN; |
| case BRW_AOP_FCMPWR: |
| return LSC_OP_ATOMIC_FCMPXCHG; |
| case BRW_AOP_FADD: |
| return LSC_OP_ATOMIC_FADD; |
| default: |
| unreachable("Unsupported float atomic opcode"); |
| } |
| } |
| |
| static enum lsc_data_size |
| lsc_bits_to_data_size(unsigned bit_size) |
| { |
| switch (bit_size / 8) { |
| case 1: return LSC_DATA_SIZE_D8U32; |
| case 2: return LSC_DATA_SIZE_D16U32; |
| case 4: return LSC_DATA_SIZE_D32; |
| case 8: return LSC_DATA_SIZE_D64; |
| default: |
| unreachable("Unsupported data size."); |
| } |
| } |
| |
| static void |
| lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| assert(devinfo->has_lsc); |
| |
| /* Get the logical send arguments. */ |
| const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS]; |
| const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA]; |
| const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE]; |
| const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE]; |
| const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS]; |
| const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG]; |
| const fs_reg allow_sample_mask = |
| inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK]; |
| assert(arg.file == IMM); |
| assert(allow_sample_mask.file == IMM); |
| |
| /* Calculate the total number of components of the payload. */ |
| const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS); |
| const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA); |
| const unsigned src_sz = type_sz(src.type); |
| const unsigned dst_sz = type_sz(inst->dst.type); |
| |
| const bool has_side_effects = inst->has_side_effects(); |
| |
| unsigned ex_mlen = 0; |
| fs_reg payload, payload2; |
| payload = bld.move_to_vgrf(addr, addr_sz); |
| if (src.file != BAD_FILE) { |
| payload2 = bld.move_to_vgrf(src, src_comps); |
| ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE; |
| } |
| |
| /* Predicate the instruction on the sample mask if needed */ |
| fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) : |
| fs_reg(brw_imm_d(0xffff)); |
| if (sample_mask.file != BAD_FILE && sample_mask.file != IMM) |
| brw_emit_predicate_on_sample_mask(bld, inst); |
| |
| if (surface.file == IMM && surface.ud == GFX7_BTI_SLM) |
| inst->sfid = GFX12_SFID_SLM; |
| else |
| inst->sfid = GFX12_SFID_UGM; |
| |
| /* We should have exactly one of surface and surface_handle. For scratch |
| * messages generated by brw_fs_nir.cpp we also allow a special value to |
| * know what heap base we should use in STATE_BASE_ADDRESS (SS = Surface |
| * State Offset, or BSS = Bindless Surface State Offset). |
| */ |
| bool non_bindless = surface.file == IMM && surface.ud == GFX125_NON_BINDLESS; |
| assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE) || |
| (non_bindless && surface_handle.file != BAD_FILE)); |
| |
| enum lsc_addr_surface_type surf_type; |
| if (surface_handle.file != BAD_FILE) { |
| assert(surface.file == IMM && (surface.ud == 0 || surface.ud == GFX125_NON_BINDLESS)); |
| surf_type = non_bindless ? LSC_ADDR_SURFTYPE_SS : LSC_ADDR_SURFTYPE_BSS; |
| } else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM) |
| surf_type = LSC_ADDR_SURFTYPE_FLAT; |
| else |
| surf_type = LSC_ADDR_SURFTYPE_BTI; |
| |
| switch (inst->opcode) { |
| case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size, |
| surf_type, LSC_ADDR_SIZE_A32, |
| 1 /* num_coordinates */, |
| LSC_DATA_SIZE_D32, arg.ud /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_LOAD_L1STATE_L3MOCS, |
| true /* has_dest */); |
| break; |
| case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size, |
| surf_type, LSC_ADDR_SIZE_A32, |
| 1 /* num_coordinates */, |
| LSC_DATA_SIZE_D32, arg.ud /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_STORE_L1STATE_L3MOCS, |
| false /* has_dest */); |
| break; |
| case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: |
| case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: { |
| /* Bspec: Atomic instruction -> Cache section: |
| * |
| * Atomic messages are always forced to "un-cacheable" in the L1 |
| * cache. |
| */ |
| enum lsc_opcode opcode = |
| inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL ? |
| brw_atomic_op_to_lsc_fatomic_op(arg.ud) : |
| brw_atomic_op_to_lsc_atomic_op(arg.ud); |
| |
| inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size, |
| surf_type, LSC_ADDR_SIZE_A32, |
| 1 /* num_coordinates */, |
| lsc_bits_to_data_size(dst_sz * 8), |
| 1 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_STORE_L1UC_L3WB, |
| !inst->dst.is_null()); |
| break; |
| } |
| case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size, |
| surf_type, LSC_ADDR_SIZE_A32, |
| 1 /* num_coordinates */, |
| lsc_bits_to_data_size(arg.ud), |
| 1 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_LOAD_L1STATE_L3MOCS, |
| true /* has_dest */); |
| break; |
| case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size, |
| surf_type, LSC_ADDR_SIZE_A32, |
| 1 /* num_coordinates */, |
| lsc_bits_to_data_size(arg.ud), |
| 1 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_STORE_L1STATE_L3MOCS, |
| false /* has_dest */); |
| break; |
| default: |
| unreachable("Unknown surface logical instruction"); |
| } |
| |
| inst->src[0] = brw_imm_ud(0); |
| |
| /* Set up extended descriptors */ |
| switch (surf_type) { |
| case LSC_ADDR_SURFTYPE_FLAT: |
| inst->src[1] = brw_imm_ud(0); |
| break; |
| case LSC_ADDR_SURFTYPE_SS: |
| case LSC_ADDR_SURFTYPE_BSS: |
| /* We assume that the driver provided the handle in the top 20 bits so |
| * we can use the surface handle directly as the extended descriptor. |
| */ |
| inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD); |
| break; |
| case LSC_ADDR_SURFTYPE_BTI: |
| if (surface.file == IMM) { |
| inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud)); |
| } else { |
| const fs_builder ubld = bld.exec_all().group(1, 0); |
| fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| ubld.SHL(tmp, surface, brw_imm_ud(24)); |
| inst->src[1] = component(tmp, 0); |
| } |
| break; |
| default: |
| unreachable("Unknown surface type"); |
| } |
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = 0; |
| inst->send_has_side_effects = has_side_effects; |
| inst->send_is_volatile = !has_side_effects; |
| |
| inst->resize_sources(4); |
| |
| /* Finally, the payload */ |
| inst->src[2] = payload; |
| inst->src[3] = payload2; |
| } |
| |
| static void |
| lower_lsc_block_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| assert(devinfo->has_lsc); |
| |
| /* Get the logical send arguments. */ |
| const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS]; |
| const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA]; |
| const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE]; |
| const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE]; |
| const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG]; |
| assert(arg.file == IMM); |
| assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE); |
| assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE); |
| |
| const bool is_stateless = |
| surface.file == IMM && (surface.ud == BRW_BTI_STATELESS || |
| surface.ud == GFX8_BTI_STATELESS_NON_COHERENT); |
| |
| const bool has_side_effects = inst->has_side_effects(); |
| |
| const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL; |
| |
| fs_builder ubld = bld.exec_all().group(1, 0); |
| fs_reg ex_desc = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| if (is_stateless) { |
| ubld.AND(ex_desc, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), |
| brw_imm_ud(INTEL_MASK(31, 10))); |
| } else { |
| ubld.MOV(ex_desc, surface_handle); |
| } |
| |
| fs_reg data; |
| if (write) { |
| const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA); |
| data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD); |
| } |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| if (surface.file == IMM && surface.ud == GFX7_BTI_SLM) |
| inst->sfid = GFX12_SFID_SLM; |
| else |
| inst->sfid = GFX12_SFID_UGM; |
| inst->desc = lsc_msg_desc(devinfo, |
| write ? LSC_OP_STORE : LSC_OP_LOAD, |
| 1 /* exec_size */, |
| inst->sfid == GFX12_SFID_SLM ? |
| LSC_ADDR_SURFTYPE_FLAT : LSC_ADDR_SURFTYPE_BSS, |
| LSC_ADDR_SIZE_A32, |
| 1 /* num_coordinates */, |
| LSC_DATA_SIZE_D32, |
| arg.ud /* num_channels */, |
| true /* transpose */, |
| LSC_CACHE_LOAD_L1STATE_L3MOCS, |
| !write /* has_dest */); |
| |
| inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); |
| inst->size_written = lsc_msg_desc_dest_len(devinfo, inst->desc) * REG_SIZE; |
| inst->exec_size = 1; |
| inst->ex_mlen = write ? DIV_ROUND_UP(arg.ud, 8) : 0; |
| inst->header_size = 0; |
| inst->send_has_side_effects = has_side_effects; |
| inst->send_is_volatile = !has_side_effects; |
| |
| inst->resize_sources(4); |
| |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = ex_desc; /* ex_desc */ |
| inst->src[2] = addr; /* payload */ |
| inst->src[3] = data; /* payload2 */ |
| } |
| |
| static void |
| lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| assert(devinfo->ver >= 9); |
| |
| /* Get the logical send arguments. */ |
| const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS]; |
| const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA]; |
| const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE]; |
| const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE]; |
| const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG]; |
| assert(arg.file == IMM); |
| assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE); |
| assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE); |
| |
| const bool is_stateless = |
| surface.file == IMM && (surface.ud == BRW_BTI_STATELESS || |
| surface.ud == GFX8_BTI_STATELESS_NON_COHERENT); |
| |
| const bool has_side_effects = inst->has_side_effects(); |
| |
| const bool align_16B = |
| inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL; |
| |
| const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL; |
| |
| /* The address is stored in the header. See MH_A32_GO and MH_BTS_GO. */ |
| fs_builder ubld = bld.exec_all().group(8, 0); |
| fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| |
| if (is_stateless) |
| ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header); |
| else |
| ubld.MOV(header, brw_imm_d(0)); |
| |
| /* Address in OWord units when aligned to OWords. */ |
| if (align_16B) |
| ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4)); |
| else |
| ubld.group(1, 0).MOV(component(header, 2), addr); |
| |
| fs_reg data; |
| unsigned ex_mlen = 0; |
| if (write) { |
| const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA); |
| data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD); |
| ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE; |
| } |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = 1; |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = 1; |
| inst->send_has_side_effects = has_side_effects; |
| inst->send_is_volatile = !has_side_effects; |
| |
| inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; |
| |
| const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B, |
| arg.ud, write); |
| setup_surface_descriptors(bld, inst, desc, surface, surface_handle); |
| |
| inst->resize_sources(4); |
| |
| inst->src[2] = header; |
| inst->src[3] = data; |
| } |
| |
| static fs_reg |
| emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr) |
| { |
| const fs_builder ubld = bld.exec_all().group(8, 0); |
| |
| assert(type_sz(addr.type) == 8 && addr.stride == 0); |
| |
| fs_reg expanded_addr = addr; |
| if (addr.file == UNIFORM) { |
| /* We can't do stride 1 with the UNIFORM file, it requires stride 0 */ |
| expanded_addr = ubld.vgrf(BRW_REGISTER_TYPE_UQ); |
| expanded_addr.stride = 0; |
| ubld.MOV(expanded_addr, retype(addr, BRW_REGISTER_TYPE_UQ)); |
| } |
| |
| fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| ubld.MOV(header, brw_imm_ud(0)); |
| |
| /* Use a 2-wide MOV to fill out the address */ |
| fs_reg addr_vec2 = expanded_addr; |
| addr_vec2.type = BRW_REGISTER_TYPE_UD; |
| addr_vec2.stride = 1; |
| ubld.group(2, 0).MOV(header, addr_vec2); |
| |
| return header; |
| } |
| |
| static void |
| emit_fragment_mask(const fs_builder &bld, fs_inst *inst) |
| { |
| assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM); |
| const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud; |
| |
| /* If we're a fragment shader, we have to predicate with the sample mask to |
| * avoid helper invocations to avoid helper invocations in instructions |
| * with side effects, unless they are explicitly required. |
| * |
| * There are also special cases when we actually want to run on helpers |
| * (ray queries). |
| */ |
| assert(bld.shader->stage == MESA_SHADER_FRAGMENT); |
| if (enable_helpers) |
| emit_predicate_on_vector_mask(bld, inst); |
| else if (inst->has_side_effects()) |
| brw_emit_predicate_on_sample_mask(bld, inst); |
| } |
| |
| static void |
| lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| |
| /* Get the logical send arguments. */ |
| const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS]; |
| const fs_reg &src = inst->src[A64_LOGICAL_SRC]; |
| const unsigned src_sz = type_sz(src.type); |
| const unsigned dst_sz = type_sz(inst->dst.type); |
| |
| const unsigned src_comps = inst->components_read(1); |
| assert(inst->src[A64_LOGICAL_ARG].file == IMM); |
| const unsigned arg = inst->src[A64_LOGICAL_ARG].ud; |
| const bool has_side_effects = inst->has_side_effects(); |
| |
| fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD); |
| fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps), |
| BRW_REGISTER_TYPE_UD); |
| unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE; |
| |
| switch (inst->opcode) { |
| case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size, |
| LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64, |
| 1 /* num_coordinates */, |
| LSC_DATA_SIZE_D32, arg /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_LOAD_L1STATE_L3MOCS, |
| true /* has_dest */); |
| break; |
| case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size, |
| LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64, |
| 1 /* num_coordinates */, |
| LSC_DATA_SIZE_D32, arg /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_STORE_L1STATE_L3MOCS, |
| false /* has_dest */); |
| break; |
| case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size, |
| LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64, |
| 1 /* num_coordinates */, |
| lsc_bits_to_data_size(arg), |
| 1 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_LOAD_L1STATE_L3MOCS, |
| true /* has_dest */); |
| break; |
| case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size, |
| LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64, |
| 1 /* num_coordinates */, |
| lsc_bits_to_data_size(arg), |
| 1 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_STORE_L1STATE_L3MOCS, |
| false /* has_dest */); |
| break; |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: { |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: |
| /* Bspec: Atomic instruction -> Cache section: |
| * |
| * Atomic messages are always forced to "un-cacheable" in the L1 |
| * cache. |
| */ |
| enum lsc_opcode opcode = |
| (inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL || |
| inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL || |
| inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL) ? |
| brw_atomic_op_to_lsc_atomic_op(arg) : |
| brw_atomic_op_to_lsc_fatomic_op(arg); |
| inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size, |
| LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64, |
| 1 /* num_coordinates */, |
| lsc_bits_to_data_size(dst_sz * 8), |
| 1 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_STORE_L1UC_L3WB, |
| !inst->dst.is_null()); |
| break; |
| } |
| case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: |
| case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: |
| inst->exec_size = 1; |
| inst->desc = lsc_msg_desc(devinfo, |
| LSC_OP_LOAD, |
| 1 /* exec_size */, |
| LSC_ADDR_SURFTYPE_FLAT, |
| LSC_ADDR_SIZE_A64, |
| 1 /* num_coordinates */, |
| LSC_DATA_SIZE_D32, |
| arg /* num_channels */, |
| true /* transpose */, |
| LSC_CACHE_LOAD_L1STATE_L3MOCS, |
| true /* has_dest */); |
| break; |
| case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: |
| inst->exec_size = 1; |
| inst->desc = lsc_msg_desc(devinfo, |
| LSC_OP_STORE, |
| 1 /* exec_size */, |
| LSC_ADDR_SURFTYPE_FLAT, |
| LSC_ADDR_SIZE_A64, |
| 1 /* num_coordinates */, |
| LSC_DATA_SIZE_D32, |
| arg /* num_channels */, |
| true /* transpose */, |
| LSC_CACHE_LOAD_L1STATE_L3MOCS, |
| false /* has_dest */); |
| |
| break; |
| default: |
| unreachable("Unknown A64 logical instruction"); |
| } |
| |
| if (bld.shader->stage == MESA_SHADER_FRAGMENT) |
| emit_fragment_mask(bld, inst); |
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = 0; |
| inst->send_has_side_effects = has_side_effects; |
| inst->send_is_volatile = !has_side_effects; |
| |
| /* Set up SFID and descriptors */ |
| inst->sfid = GFX12_SFID_UGM; |
| inst->resize_sources(4); |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = payload; |
| inst->src[3] = payload2; |
| } |
| |
| static void |
| lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| |
| const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS]; |
| const fs_reg &src = inst->src[A64_LOGICAL_SRC]; |
| const unsigned src_comps = inst->components_read(1); |
| assert(inst->src[A64_LOGICAL_ARG].file == IMM); |
| const unsigned arg = inst->src[A64_LOGICAL_ARG].ud; |
| const bool has_side_effects = inst->has_side_effects(); |
| |
| fs_reg payload, payload2; |
| unsigned mlen, ex_mlen = 0, header_size = 0; |
| if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL || |
| inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL || |
| inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) { |
| assert(devinfo->ver >= 9); |
| |
| /* OWORD messages only take a scalar address in a header */ |
| mlen = 1; |
| header_size = 1; |
| payload = emit_a64_oword_block_header(bld, addr); |
| |
| if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) { |
| ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE; |
| payload2 = retype(bld.move_to_vgrf(src, src_comps), |
| BRW_REGISTER_TYPE_UD); |
| } |
| } else if (devinfo->ver >= 9) { |
| /* On Skylake and above, we have SENDS */ |
| mlen = 2 * (inst->exec_size / 8); |
| ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE; |
| payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD); |
| payload2 = retype(bld.move_to_vgrf(src, src_comps), |
| BRW_REGISTER_TYPE_UD); |
| } else { |
| /* Add two because the address is 64-bit */ |
| const unsigned dwords = 2 + src_comps; |
| mlen = dwords * (inst->exec_size / 8); |
| |
| fs_reg sources[5]; |
| |
| sources[0] = addr; |
| |
| for (unsigned i = 0; i < src_comps; i++) |
| sources[1 + i] = offset(src, bld, i); |
| |
| payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords); |
| bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0); |
| } |
| |
| uint32_t desc; |
| switch (inst->opcode) { |
| case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: |
| desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size, |
| arg, /* num_channels */ |
| false /* write */); |
| break; |
| |
| case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: |
| desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size, |
| arg, /* num_channels */ |
| true /* write */); |
| break; |
| |
| case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: |
| desc = brw_dp_a64_oword_block_rw_desc(devinfo, |
| true, /* align_16B */ |
| arg, /* num_dwords */ |
| false /* write */); |
| break; |
| |
| case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: |
| desc = brw_dp_a64_oword_block_rw_desc(devinfo, |
| false, /* align_16B */ |
| arg, /* num_dwords */ |
| false /* write */); |
| break; |
| |
| case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: |
| desc = brw_dp_a64_oword_block_rw_desc(devinfo, |
| true, /* align_16B */ |
| arg, /* num_dwords */ |
| true /* write */); |
| break; |
| |
| case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: |
| desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, |
| arg, /* bit_size */ |
| false /* write */); |
| break; |
| |
| case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: |
| desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, |
| arg, /* bit_size */ |
| true /* write */); |
| break; |
| |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: |
| desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32, |
| arg, /* atomic_op */ |
| !inst->dst.is_null()); |
| break; |
| |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL: |
| desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 16, |
| arg, /* atomic_op */ |
| !inst->dst.is_null()); |
| break; |
| |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: |
| desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64, |
| arg, /* atomic_op */ |
| !inst->dst.is_null()); |
| break; |
| |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL: |
| desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size, |
| 16, /* bit_size */ |
| arg, /* atomic_op */ |
| !inst->dst.is_null()); |
| break; |
| |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: |
| desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size, |
| 32, /* bit_size */ |
| arg, /* atomic_op */ |
| !inst->dst.is_null()); |
| break; |
| |
| default: |
| unreachable("Unknown A64 logical instruction"); |
| } |
| |
| if (bld.shader->stage == MESA_SHADER_FRAGMENT) |
| emit_fragment_mask(bld, inst); |
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = mlen; |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = header_size; |
| inst->send_has_side_effects = has_side_effects; |
| inst->send_is_volatile = !has_side_effects; |
| |
| /* Set up SFID and descriptors */ |
| inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1; |
| inst->desc = desc; |
| inst->resize_sources(4); |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = payload; |
| inst->src[3] = payload2; |
| } |
| |
| static void |
| lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld, |
| fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| ASSERTED const brw_compiler *compiler = bld.shader->compiler; |
| |
| fs_reg index = inst->src[0]; |
| |
| /* We are switching the instruction from an ALU-like instruction to a |
| * send-from-grf instruction. Since sends can't handle strides or |
| * source modifiers, we have to make a copy of the offset source. |
| */ |
| fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1); |
| |
| assert(inst->src[2].file == BRW_IMMEDIATE_VALUE); |
| unsigned alignment = inst->src[2].ud; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->sfid = GFX12_SFID_UGM; |
| inst->resize_sources(3); |
| inst->src[0] = brw_imm_ud(0); |
| |
| if (index.file == IMM) { |
| inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, index.ud)); |
| } else { |
| const fs_builder ubld = bld.exec_all().group(1, 0); |
| fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| ubld.SHL(tmp, index, brw_imm_ud(24)); |
| inst->src[1] = component(tmp, 0); |
| } |
| |
| assert(!compiler->indirect_ubos_use_sampler); |
| |
| inst->src[2] = ubo_offset; /* payload */ |
| if (alignment >= 4) { |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size, |
| LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32, |
| 1 /* num_coordinates */, |
| LSC_DATA_SIZE_D32, |
| 4 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_LOAD_L1STATE_L3MOCS, |
| true /* has_dest */); |
| inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); |
| } else { |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size, |
| LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32, |
| 1 /* num_coordinates */, |
| LSC_DATA_SIZE_D32, |
| 1 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE_LOAD_L1STATE_L3MOCS, |
| true /* has_dest */); |
| inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); |
| /* The byte scattered messages can only read one dword at a time so |
| * we have to duplicate the message 4 times to read the full vec4. |
| * Hopefully, dead code will clean up the mess if some of them aren't |
| * needed. |
| */ |
| assert(inst->size_written == 16 * inst->exec_size); |
| inst->size_written /= 4; |
| for (unsigned c = 1; c < 4; c++) { |
| /* Emit a copy of the instruction because we're about to modify |
| * it. Because this loop starts at 1, we will emit copies for the |
| * first 3 and the final one will be the modified instruction. |
| */ |
| bld.emit(*inst); |
| |
| /* Offset the source */ |
| inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD); |
| bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4)); |
| |
| /* Offset the destination */ |
| inst->dst = offset(inst->dst, bld, 1); |
| } |
| } |
| } |
| |
| static void |
| lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const brw_compiler *compiler = bld.shader->compiler; |
| |
| if (devinfo->ver >= 7) { |
| fs_reg index = inst->src[0]; |
| /* We are switching the instruction from an ALU-like instruction to a |
| * send-from-grf instruction. Since sends can't handle strides or |
| * source modifiers, we have to make a copy of the offset source. |
| */ |
| fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); |
| bld.MOV(ubo_offset, inst->src[1]); |
| |
| assert(inst->src[2].file == BRW_IMMEDIATE_VALUE); |
| unsigned alignment = inst->src[2].ud; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = inst->exec_size / 8; |
| inst->resize_sources(3); |
| |
| if (index.file == IMM) { |
| inst->desc = index.ud & 0xff; |
| inst->src[0] = brw_imm_ud(0); |
| } else { |
| inst->desc = 0; |
| const fs_builder ubld = bld.exec_all().group(1, 0); |
| fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| ubld.AND(tmp, index, brw_imm_ud(0xff)); |
| inst->src[0] = component(tmp, 0); |
| } |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = ubo_offset; /* payload */ |
| |
| if (compiler->indirect_ubos_use_sampler) { |
| const unsigned simd_mode = |
| inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : |
| BRW_SAMPLER_SIMD_MODE_SIMD16; |
| |
| inst->sfid = BRW_SFID_SAMPLER; |
| inst->desc |= brw_sampler_desc(devinfo, 0, 0, |
| GFX5_SAMPLER_MESSAGE_SAMPLE_LD, |
| simd_mode, 0); |
| } else if (alignment >= 4) { |
| inst->sfid = (devinfo->verx10 >= 75 ? |
| HSW_SFID_DATAPORT_DATA_CACHE_1 : |
| GFX7_SFID_DATAPORT_DATA_CACHE); |
| inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, |
| 4, /* num_channels */ |
| false /* write */); |
| } else { |
| inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; |
| inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, |
| 32, /* bit_size */ |
| false /* write */); |
| /* The byte scattered messages can only read one dword at a time so |
| * we have to duplicate the message 4 times to read the full vec4. |
| * Hopefully, dead code will clean up the mess if some of them aren't |
| * needed. |
| */ |
| assert(inst->size_written == 16 * inst->exec_size); |
| inst->size_written /= 4; |
| for (unsigned c = 1; c < 4; c++) { |
| /* Emit a copy of the instruction because we're about to modify |
| * it. Because this loop starts at 1, we will emit copies for the |
| * first 3 and the final one will be the modified instruction. |
| */ |
| bld.emit(*inst); |
| |
| /* Offset the source */ |
| inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD); |
| bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4)); |
| |
| /* Offset the destination */ |
| inst->dst = offset(inst->dst, bld, 1); |
| } |
| } |
| } else { |
| const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver), |
| BRW_REGISTER_TYPE_UD); |
| |
| bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]); |
| |
| inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4; |
| inst->resize_sources(1); |
| inst->base_mrf = payload.nr; |
| inst->header_size = 1; |
| inst->mlen = 1 + inst->exec_size / 8; |
| } |
| } |
| |
| static void |
| lower_math_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| assert(bld.shader->devinfo->ver < 6); |
| |
| inst->base_mrf = 2; |
| inst->mlen = inst->sources * inst->exec_size / 8; |
| |
| if (inst->sources > 1) { |
| /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 |
| * "Message Payload": |
| * |
| * "Operand0[7]. For the INT DIV functions, this operand is the |
| * denominator." |
| * ... |
| * "Operand1[7]. For the INT DIV functions, this operand is the |
| * numerator." |
| */ |
| const bool is_int_div = inst->opcode != SHADER_OPCODE_POW; |
| const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0]; |
| const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1]; |
| |
| inst->resize_sources(1); |
| inst->src[0] = src0; |
| |
| assert(inst->exec_size == 8); |
| bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1); |
| } |
| } |
| |
| static void |
| lower_btd_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| fs_reg global_addr = inst->src[0]; |
| const fs_reg &btd_record = inst->src[1]; |
| |
| const unsigned mlen = 2; |
| const fs_builder ubld = bld.exec_all().group(8, 0); |
| fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); |
| |
| ubld.MOV(header, brw_imm_ud(0)); |
| switch (inst->opcode) { |
| case SHADER_OPCODE_BTD_SPAWN_LOGICAL: |
| assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0); |
| global_addr.type = BRW_REGISTER_TYPE_UD; |
| global_addr.stride = 1; |
| ubld.group(2, 0).MOV(header, global_addr); |
| break; |
| |
| case SHADER_OPCODE_BTD_RETIRE_LOGICAL: |
| /* The bottom bit is the Stack ID release bit */ |
| ubld.group(1, 0).MOV(header, brw_imm_ud(1)); |
| break; |
| |
| default: |
| unreachable("Invalid BTD message"); |
| } |
| |
| /* Stack IDs are always in R1 regardless of whether we're coming from a |
| * bindless shader or a regular compute shader. |
| */ |
| fs_reg stack_ids = |
| retype(byte_offset(header, REG_SIZE), BRW_REGISTER_TYPE_UW); |
| bld.MOV(stack_ids, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW)); |
| |
| unsigned ex_mlen = 0; |
| fs_reg payload; |
| if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) { |
| ex_mlen = 2 * (inst->exec_size / 8); |
| payload = bld.move_to_vgrf(btd_record, 1); |
| } else { |
| assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL); |
| /* All these messages take a BTD and things complain if we don't provide |
| * one for RETIRE. However, it shouldn't ever actually get used so fill |
| * it with zero. |
| */ |
| ex_mlen = 2 * (inst->exec_size / 8); |
| payload = bld.move_to_vgrf(brw_imm_uq(0), 1); |
| } |
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = mlen; |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = 0; /* HW docs require has_header = false */ |
| inst->send_has_side_effects = true; |
| inst->send_is_volatile = false; |
| |
| /* Set up SFID and descriptors */ |
| inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH; |
| inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size, |
| GEN_RT_BTD_MESSAGE_SPAWN); |
| inst->resize_sources(4); |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = header; |
| inst->src[3] = payload; |
| } |
| |
| static void |
| lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| /* The emit_uniformize() in brw_fs_nir.cpp will generate an horizontal |
| * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use UQ/Q |
| * types in on Gfx12.5, we need to tweak the stride with a value of 1 dword |
| * so that the MOV operates on 2 components rather than twice the same |
| * component. |
| */ |
| fs_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_REGISTER_TYPE_UD); |
| globals_addr.stride = 1; |
| const fs_reg &bvh_level = |
| inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ? |
| inst->src[RT_LOGICAL_SRC_BVH_LEVEL] : |
| bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL], |
| inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL)); |
| const fs_reg &trace_ray_control = |
| inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ? |
| inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] : |
| bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL], |
| inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL)); |
| const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS]; |
| assert(synchronous_src.file == BRW_IMMEDIATE_VALUE); |
| const bool synchronous = synchronous_src.ud; |
| |
| const unsigned mlen = 1; |
| const fs_builder ubld = bld.exec_all().group(8, 0); |
| fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD); |
| ubld.MOV(header, brw_imm_ud(0)); |
| ubld.group(2, 0).MOV(header, globals_addr); |
| if (synchronous) |
| ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous)); |
| |
| const unsigned ex_mlen = inst->exec_size / 8; |
| fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD); |
| if (bvh_level.file == BRW_IMMEDIATE_VALUE && |
| trace_ray_control.file == BRW_IMMEDIATE_VALUE) { |
| bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) | |
| (bvh_level.ud & 0x7))); |
| } else { |
| bld.SHL(payload, trace_ray_control, brw_imm_ud(8)); |
| bld.OR(payload, payload, bvh_level); |
| } |
| |
| /* When doing synchronous traversal, the HW implicitly computes the |
| * stack_id using the following formula : |
| * |
| * EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0] |
| * |
| * Only in the asynchronous case we need to set the stack_id given from the |
| * payload register. |
| */ |
| if (!synchronous) { |
| bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1), |
| retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW), |
| brw_imm_uw(0x7ff)); |
| } |
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = mlen; |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = 0; /* HW docs require has_header = false */ |
| inst->send_has_side_effects = true; |
| inst->send_is_volatile = false; |
| |
| /* Set up SFID and descriptors */ |
| inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR; |
| inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size); |
| inst->resize_sources(4); |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = header; |
| inst->src[3] = payload; |
| } |
| |
| bool |
| fs_visitor::lower_logical_sends() |
| { |
| bool progress = false; |
| |
| foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { |
| const fs_builder ibld(this, block, inst); |
| |
| switch (inst->opcode) { |
| case FS_OPCODE_FB_WRITE_LOGICAL: |
| assert(stage == MESA_SHADER_FRAGMENT); |
| lower_fb_write_logical_send(ibld, inst, |
| brw_wm_prog_data(prog_data), |
| (const brw_wm_prog_key *)key, |
| fs_payload()); |
| break; |
| |
| case FS_OPCODE_FB_READ_LOGICAL: |
| lower_fb_read_logical_send(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_TEX_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX); |
| break; |
| |
| case SHADER_OPCODE_TXD_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD); |
| break; |
| |
| case SHADER_OPCODE_TXF_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF); |
| break; |
| |
| case SHADER_OPCODE_TXL_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL); |
| break; |
| |
| case SHADER_OPCODE_TXS_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS); |
| break; |
| |
| case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, |
| SHADER_OPCODE_IMAGE_SIZE_LOGICAL); |
| break; |
| |
| case FS_OPCODE_TXB_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB); |
| break; |
| |
| case SHADER_OPCODE_TXF_CMS_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS); |
| break; |
| |
| case SHADER_OPCODE_TXF_CMS_W_LOGICAL: |
| case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W); |
| break; |
| |
| case SHADER_OPCODE_TXF_UMS_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS); |
| break; |
| |
| case SHADER_OPCODE_TXF_MCS_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS); |
| break; |
| |
| case SHADER_OPCODE_LOD_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD); |
| break; |
| |
| case SHADER_OPCODE_TG4_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4); |
| break; |
| |
| case SHADER_OPCODE_TG4_OFFSET_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET); |
| break; |
| |
| case SHADER_OPCODE_SAMPLEINFO_LOGICAL: |
| lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO); |
| break; |
| |
| case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: |
| case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: |
| case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: |
| case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: |
| case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: |
| case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: |
| if (devinfo->has_lsc) { |
| lower_lsc_surface_logical_send(ibld, inst); |
| break; |
| } |
| case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: |
| case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: |
| case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: |
| case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: |
| case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: |
| lower_surface_logical_send(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: |
| case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL: |
| if (devinfo->has_lsc) { |
| lower_lsc_block_logical_send(ibld, inst); |
| break; |
| } |
| lower_surface_block_logical_send(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: |
| case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: |
| case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: |
| case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: |
| case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: |
| case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: |
| case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: |
| if (devinfo->has_lsc) { |
| lower_lsc_a64_logical_send(ibld, inst); |
| break; |
| } |
| lower_a64_logical_send(ibld, inst); |
| break; |
| |
| case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: |
| if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler) |
| lower_lsc_varying_pull_constant_logical_send(ibld, inst); |
| else |
| lower_varying_pull_constant_logical_send(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_RCP: |
| case SHADER_OPCODE_RSQ: |
| case SHADER_OPCODE_SQRT: |
| case SHADER_OPCODE_EXP2: |
| case SHADER_OPCODE_LOG2: |
| case SHADER_OPCODE_SIN: |
| case SHADER_OPCODE_COS: |
| case SHADER_OPCODE_POW: |
| case SHADER_OPCODE_INT_QUOTIENT: |
| case SHADER_OPCODE_INT_REMAINDER: |
| /* The math opcodes are overloaded for the send-like and |
| * expression-like instructions which seems kind of icky. Gfx6+ has |
| * a native (but rather quirky) MATH instruction so we don't need to |
| * do anything here. On Gfx4-5 we'll have to lower the Gfx6-like |
| * logical instructions (which we can easily recognize because they |
| * have mlen = 0) into send-like virtual instructions. |
| */ |
| if (devinfo->ver < 6 && inst->mlen == 0) { |
| lower_math_logical_send(ibld, inst); |
| break; |
| |
| } else { |
| continue; |
| } |
| |
| case SHADER_OPCODE_BTD_SPAWN_LOGICAL: |
| case SHADER_OPCODE_BTD_RETIRE_LOGICAL: |
| lower_btd_logical_send(ibld, inst); |
| break; |
| |
| case RT_OPCODE_TRACE_RAY_LOGICAL: |
| lower_trace_ray_logical_send(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_URB_READ_LOGICAL: |
| lower_urb_read_logical_send(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_URB_WRITE_LOGICAL: |
| lower_urb_write_logical_send(ibld, inst); |
| break; |
| |
| default: |
| continue; |
| } |
| |
| progress = true; |
| } |
| |
| if (progress) |
| invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); |
| |
| return progress; |
| } |