| /* |
| * Copyright © 2010 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "brw_shader.h" |
| #include "brw_builder.h" |
| |
| /* Wa_14015360517 |
| * |
 * The first instruction of any kernel should have a non-zero execution
 * mask.  Make sure this happens by introducing a dummy MOV instruction.
| */ |
| bool |
| brw_workaround_emit_dummy_mov_instruction(brw_shader &s) |
| { |
| if (!intel_needs_workaround(s.devinfo, 14015360517)) |
| return false; |
| |
| brw_inst *first_inst = |
| s.cfg->first_block()->start(); |
| |
   /* We can skip the WA if the first instruction is marked with
    * force_writemask_all or its exec_size equals the dispatch width.
| */ |
| if (first_inst->force_writemask_all || |
| first_inst->exec_size == s.dispatch_width) |
| return false; |
| |
   /* Insert a dummy MOV as the first instruction. */
| const brw_builder ubld = brw_builder(first_inst).exec_all().group(8, 0); |
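   /* This emits roughly `mov(8) null<1>:UD 0x0:UD { NoMask }` (assembly
    * syntax is a sketch).  NoMask gives the instruction a non-zero
    * execution mask even when parts of the dispatch are disabled.
    */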
| ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u)); |
| |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | |
| BRW_DEPENDENCY_VARIABLES); |
| return true; |
| } |
| |
| static bool |
| needs_dummy_fence(const intel_device_info *devinfo, brw_inst *inst) |
| { |
   /* This workaround makes sure that any instruction writing through UGM
    * has completed before we hit EOT.
| */ |
| if (inst->sfid != BRW_SFID_UGM) |
| return false; |
| |
   /* Any UGM non-scratch-surface store message (not including atomics)
    * where the L1 cache override is NOT among {WB, WS, WT} needs the fence.
| */ |
| enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc); |
| if (lsc_opcode_is_store(opcode)) { |
| switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) { |
| case LSC_CACHE_STORE_L1STATE_L3MOCS: |
| case LSC_CACHE_STORE_L1WB_L3WB: |
| case LSC_CACHE_STORE_L1S_L3UC: |
| case LSC_CACHE_STORE_L1S_L3WB: |
| case LSC_CACHE_STORE_L1WT_L3UC: |
| case LSC_CACHE_STORE_L1WT_L3WB: |
| return false; |
| |
| default: |
| return true; |
| } |
| } |
| |
   /* Any UGM atomic message WITHOUT a return value. */
| if (lsc_opcode_is_atomic(opcode) && inst->dst.is_null()) |
| return true; |
| |
| return false; |
| } |
| |
| /* Wa_22013689345 |
| * |
 * We need to emit a UGM fence message before EOT if the shader has any UGM
 * write or atomic messages.
| * |
| * TODO/FINISHME: According to Curro we could avoid the fence in some cases. |
 * We probably need better criteria in needs_dummy_fence().
| */ |
| bool |
| brw_workaround_memory_fence_before_eot(brw_shader &s) |
| { |
| bool progress = false; |
| bool has_ugm_write_or_atomic = false; |
| |
| if (!intel_needs_workaround(s.devinfo, 22013689345)) |
| return false; |
| |
| /* Needs to happen after brw_lower_logical_sends & before |
| * brw_lower_send_descriptors. |
| */ |
| assert(s.phase == BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING); |
| |
| foreach_block_and_inst_safe (block, brw_inst, inst, s.cfg) { |
| if (!inst->eot) { |
| if (needs_dummy_fence(s.devinfo, inst)) |
| has_ugm_write_or_atomic = true; |
| continue; |
| } |
| |
| if (!has_ugm_write_or_atomic) |
| break; |
| |
| const brw_builder ubld = brw_builder(inst).uniform(); |
| |
| brw_reg dst = ubld.vgrf(BRW_TYPE_UD); |
| brw_inst *dummy_fence = ubld.emit(SHADER_OPCODE_SEND, dst); |
| |
| dummy_fence->resize_sources(4); |
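      /* By the SHADER_OPCODE_SEND convention in this backend, src[0] is the
       * descriptor, src[1] the extended descriptor, src[2] the payload
       * (here just the r0 header) and src[3] the second payload half
       * (unused).
       */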
| dummy_fence->src[0] = brw_imm_ud(0); |
| dummy_fence->src[1] = brw_imm_ud(0); |
| dummy_fence->src[2] = brw_vec8_grf(0, 0); |
| dummy_fence->src[3] = brw_reg(); |
| dummy_fence->mlen = reg_unit(s.devinfo); |
| dummy_fence->ex_mlen = 0; |
| dummy_fence->sfid = BRW_SFID_UGM; |
| dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE, |
| LSC_FLUSH_TYPE_NONE_6, false); |
| dummy_fence->size_written = REG_SIZE * reg_unit(s.devinfo); |
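      /* The scheduling fence sources the fence's destination register, so
       * the EOT message stays ordered after the fence's completion
       * writeback.
       */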
| ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst); |
| progress = true; |
      /* TODO: remove this break if we ever have a shader with multiple EOTs. */
| break; |
| } |
| |
| if (progress) { |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | |
| BRW_DEPENDENCY_VARIABLES); |
| } |
| |
| return progress; |
| } |
| |
| /** |
| * Find the first instruction in the program that might start a region of |
 * divergent control flow due to a HALT jump.  There is no
 * find_halt_control_flow_region_end(); the region of divergence extends
 * until the only SHADER_OPCODE_HALT_TARGET in the program.
| */ |
| static const brw_inst * |
| find_halt_control_flow_region_start(const brw_shader *v) |
| { |
| foreach_block_and_inst(block, brw_inst, inst, v->cfg) { |
| if (inst->opcode == BRW_OPCODE_HALT || |
| inst->opcode == SHADER_OPCODE_HALT_TARGET) |
| return inst; |
| } |
| |
| return NULL; |
| } |
| |
| /** |
 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
 * can cause a BB to be executed with all channels disabled, which leads to
 * the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down.  This may
 * break assumptions of some NoMask SEND messages whose descriptor depends
 * on data generated by live invocations of the shader.
 *
 * The workaround avoids the problem by predicating certain instructions on
 * an ANY horizontal predicate that ensures their execution is omitted when
 * all channels of the program are disabled.
| */ |
| bool |
| brw_workaround_nomask_control_flow(brw_shader &s) |
| { |
| if (s.devinfo->ver != 12) |
| return false; |
| |
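   /* Pick the "any channel enabled" predicate matching the dispatch width:
    * ANY32H for SIMD32, ANY16H for SIMD16 and ANY8H for SIMD8.
    */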
   const brw_predicate pred = s.dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              s.dispatch_width > 8  ? BRW_PREDICATE_ALIGN1_ANY16H :
                                                      BRW_PREDICATE_ALIGN1_ANY8H;
| const brw_inst *halt_start = find_halt_control_flow_region_start(&s); |
| unsigned depth = 0; |
| bool progress = false; |
| |
| const brw_live_variables &live_vars = s.live_analysis.require(); |
| |
| /* Scan the program backwards in order to be able to easily determine |
| * whether the flag register is live at any point. |
| */ |
| foreach_block_reverse_safe(block, s.cfg) { |
| BITSET_WORD flag_liveout = live_vars.block_data[block->num] |
| .flag_liveout[0]; |
| STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1); |
| |
| foreach_inst_in_block_reverse_safe(brw_inst, inst, block) { |
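         /* An unpredicated instruction with exec_size >= 8 overwrites whole
          * bytes of the flag register, so the bits it writes are no longer
          * live above this point.
          */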
| if (!inst->predicate && inst->exec_size >= 8) |
| flag_liveout &= ~inst->flags_written(s.devinfo); |
| |
| switch (inst->opcode) { |
| case BRW_OPCODE_DO: |
| case BRW_OPCODE_IF: |
| /* Note that this doesn't handle BRW_OPCODE_HALT since only |
| * the first one in the program closes the region of divergent |
             * control flow due to any HALT instructions.  Instead, this is
| * handled with the halt_start check below. |
| */ |
| depth--; |
| break; |
| |
| case BRW_OPCODE_WHILE: |
| case BRW_OPCODE_ENDIF: |
| case SHADER_OPCODE_HALT_TARGET: |
| depth++; |
| break; |
| |
| default: |
| /* Note that the vast majority of NoMask SEND instructions in the |
| * program are harmless while executed in a block with all |
| * channels disabled, since any instructions with side effects we |
| * could hit here should be execution-masked. |
| * |
| * The main concern is NoMask SEND instructions where the message |
| * descriptor or header depends on data generated by live |
| * invocations of the shader (RESINFO and |
| * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically |
| * computed surface index seem to be the only examples right now |
| * where this could easily lead to GPU hangs). Unfortunately we |
| * have no straightforward way to detect that currently, so just |
| * predicate any NoMask SEND instructions we find under control |
| * flow. |
| * |
| * If this proves to have a measurable performance impact it can |
| * be easily extended with a whitelist of messages we know we can |
| * safely omit the predication for. |
| */ |
| if (depth && inst->force_writemask_all && |
| is_send(inst) && !inst->predicate && |
| !inst->has_no_mask_send_params) { |
| /* We need to load the execution mask into the flag register by |
| * using a builder with channel group matching the whole shader |
| * (rather than the default which is derived from the original |
| * instruction), in order to avoid getting a right-shifted |
| * value. |
| */ |
| const brw_builder ubld = brw_builder(inst) |
| .exec_all().group(s.dispatch_width, 0); |
| const brw_reg flag = retype(brw_flag_reg(0, 0), |
| BRW_TYPE_UD); |
| |
| /* Due to the lack of flag register allocation we need to save |
| * and restore the flag register if it's live. |
| */ |
| const bool save_flag = flag_liveout & |
| brw_flag_mask(flag, s.dispatch_width / 8); |
| const brw_reg tmp = ubld.group(8, 0).vgrf(flag.type); |
| |
| if (save_flag) { |
| ubld.group(8, 0).UNDEF(tmp); |
| ubld.group(1, 0).MOV(tmp, flag); |
| } |
| |
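               /* Load the mask of live channels into the flag register for
                * the ANY predicate below to test.
                */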
| ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS); |
| |
| set_predicate(pred, inst); |
| inst->flag_subreg = 0; |
| inst->predicate_trivial = true; |
| |
| if (save_flag) |
| ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp); |
| |
| progress = true; |
| } |
| break; |
| } |
| |
| if (inst == halt_start) |
| depth--; |
| |
| flag_liveout |= inst->flags_read(s.devinfo); |
| } |
| } |
| |
| if (progress) |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | |
| BRW_DEPENDENCY_VARIABLES); |
| |
| return progress; |
| } |
| |
| /** |
| * flags_read() and flags_written() return flag access with byte granularity, |
 * but the PRM lists "Access Granularity: Word" for the Flag Register, so we
 * can assume that accessing any part of a word clears its register dependency.
| */ |
| static unsigned |
| bytes_bitmask_to_words(unsigned b) |
| { |
| unsigned first_byte_mask = b & 0x55555555; |
| unsigned second_byte_mask = b & 0xaaaaaaaa; |
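   /* Example: b = 0x4 (only byte 2 accessed) yields 0xc, i.e. both bytes
    * of word 1.
    */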
| return first_byte_mask | |
| (first_byte_mask << 1) | |
| second_byte_mask | |
| (second_byte_mask >> 1); |
| } |
| |
| /** |
| * WaClearArfDependenciesBeforeEot |
| * |
 * Flag register dependencies are not cleared after EOT, so we have to source
 * them before EOT.  We can do this with a simple `mov(1) null:UD, f{0,1}:UD`.
 *
 * To avoid emitting MOVs when they aren't needed, check whether each block
 * reads all the flags it sets.  We might falsely determine a register as
 * unread if it is accessed in a later block, but this should still be good
 * enough.
| */ |
| bool |
| brw_workaround_source_arf_before_eot(brw_shader &s) |
| { |
| bool progress = false; |
| |
| if (s.devinfo->ver != 9) |
| return false; |
| |
| unsigned flags_unread = 0; |
| |
| foreach_block(block, s.cfg) { |
| unsigned flags_unread_in_block = 0; |
| |
| foreach_inst_in_block(brw_inst, inst, block) { |
      /* An instruction can read and write the same flag, so the order of
       * these updates is important.
       */
| flags_unread_in_block &= ~bytes_bitmask_to_words(inst->flags_read(s.devinfo)); |
| flags_unread_in_block |= bytes_bitmask_to_words(inst->flags_written(s.devinfo)); |
| |
      /* HALT does not start a new block even though it can leave a dependency. */
| if (inst->opcode == BRW_OPCODE_HALT || |
| inst->opcode == SHADER_OPCODE_HALT_TARGET) { |
| flags_unread |= flags_unread_in_block; |
| flags_unread_in_block = 0; |
| } |
| } |
| |
| flags_unread |= flags_unread_in_block; |
| |
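      /* Bits 0-3 cover f0 and bits 4-7 cover f1.  Once both flag registers
       * have potentially unread writes, scanning further blocks cannot
       * change the outcome.
       */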
| if ((flags_unread & 0x0f) && (flags_unread & 0xf0)) |
| break; |
| } |
| |
| if (flags_unread) { |
| int eot_count = 0; |
| |
      foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
| if (!inst->eot) |
| continue; |
| |
         /* Currently we always emit only one EOT per program; this WA
          * should be updated if that ever changes.
| */ |
| assert(++eot_count == 1); |
| |
| const brw_builder ubld = brw_builder(inst).uniform(); |
| |
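         /* Source each flag register that may still have an unread write,
          * clearing its dangling dependency before the EOT message.
          */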
| if (flags_unread & 0x0f) |
| ubld.MOV(ubld.null_reg_ud(), retype(brw_flag_reg(0, 0), BRW_TYPE_UD)); |
| |
| if (flags_unread & 0xf0) |
| ubld.MOV(ubld.null_reg_ud(), retype(brw_flag_reg(1, 0), BRW_TYPE_UD)); |
| } |
| |
| progress = true; |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS); |
| } |
| |
| return progress; |
| } |