| /* |
| * Copyright © 2010 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "brw_shader.h" |
| #include "brw_builder.h" |
| |
| /* Wa_14015360517 |
| * |
 * The first instruction of any kernel should have a non-zero execution
 * mask.  Make sure this happens by introducing a dummy MOV instruction.
| */ |
| bool |
| brw_workaround_emit_dummy_mov_instruction(brw_shader &s) |
| { |
| if (!intel_needs_workaround(s.devinfo, 14015360517)) |
| return false; |
| |
| brw_inst *first_inst = |
| s.cfg->first_block()->start(); |
| |
   /* We can skip the WA if the first instruction is marked with
    * force_writemask_all or its exec_size equals the dispatch width.
| */ |
| if (first_inst->force_writemask_all || |
| first_inst->exec_size == s.dispatch_width) |
| return false; |
| |
   /* Insert a dummy MOV as the first instruction. */
| const brw_builder ubld = brw_builder(first_inst).exec_all().group(8, 0); |
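   /* This emits roughly `mov(8) null<1>:UD 0x0:UD { NoMask }` (assembly
    * syntax is a sketch).  NoMask gives the instruction a non-zero
    * execution mask even when parts of the dispatch are disabled.
    */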
| ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u)); |
| |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | |
| BRW_DEPENDENCY_VARIABLES); |
| return true; |
| } |
| |
| static bool |
| needs_dummy_fence(const intel_device_info *devinfo, brw_inst *inst) |
| { |
   /* This workaround makes sure that any instruction writing through UGM
    * has completed before we hit EOT.
| */ |
| if (inst->sfid != BRW_SFID_UGM) |
| return false; |
| |
   /* Any UGM non-scratch-surface store message (not including atomics)
    * where the L1 cache override is NOT among {WB, WS, WT} needs the fence.
| */ |
| enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc); |
| if (lsc_opcode_is_store(opcode)) { |
| switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) { |
| case LSC_CACHE_STORE_L1STATE_L3MOCS: |
| case LSC_CACHE_STORE_L1WB_L3WB: |
| case LSC_CACHE_STORE_L1S_L3UC: |
| case LSC_CACHE_STORE_L1S_L3WB: |
| case LSC_CACHE_STORE_L1WT_L3UC: |
| case LSC_CACHE_STORE_L1WT_L3WB: |
| return false; |
| |
| default: |
| return true; |
| } |
| } |
| |
   /* Any UGM atomic message WITHOUT a return value. */
| if (lsc_opcode_is_atomic(opcode) && inst->dst.is_null()) |
| return true; |
| |
| return false; |
| } |
| |
| /* Wa_22013689345 |
| * |
 * We need to emit a UGM fence message before EOT if the shader has any UGM
 * write or atomic messages.
| * |
| * TODO/FINISHME: According to Curro we could avoid the fence in some cases. |
 * We probably need better criteria in needs_dummy_fence().
| */ |
| bool |
| brw_workaround_memory_fence_before_eot(brw_shader &s) |
| { |
| bool progress = false; |
| bool has_ugm_write_or_atomic = false; |
| |
| if (!intel_needs_workaround(s.devinfo, 22013689345)) |
| return false; |
| |
| /* Needs to happen after brw_lower_logical_sends & before |
| * brw_lower_send_descriptors. |
| */ |
| assert(s.phase == BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING); |
| |
| foreach_block_and_inst_safe (block, brw_inst, inst, s.cfg) { |
| if (!inst->eot) { |
| if (needs_dummy_fence(s.devinfo, inst)) |
| has_ugm_write_or_atomic = true; |
| continue; |
| } |
| |
| if (!has_ugm_write_or_atomic) |
| break; |
| |
| const brw_builder ubld = brw_builder(inst).uniform(); |
| |
| brw_reg dst = ubld.vgrf(BRW_TYPE_UD); |
| brw_inst *dummy_fence = ubld.emit(SHADER_OPCODE_SEND, dst); |
| |
| dummy_fence->resize_sources(4); |
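      /* By the SHADER_OPCODE_SEND convention in this backend, src[0] is the
       * descriptor, src[1] the extended descriptor, src[2] the payload
       * (here just the r0 header) and src[3] the second payload half
       * (unused).
       */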
| dummy_fence->src[0] = brw_imm_ud(0); |
| dummy_fence->src[1] = brw_imm_ud(0); |
| dummy_fence->src[2] = brw_vec8_grf(0, 0); |
| dummy_fence->src[3] = brw_reg(); |
| dummy_fence->mlen = reg_unit(s.devinfo); |
| dummy_fence->ex_mlen = 0; |
| dummy_fence->sfid = BRW_SFID_UGM; |
| dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE, |
| LSC_FLUSH_TYPE_NONE_6, false); |
| dummy_fence->size_written = REG_SIZE * reg_unit(s.devinfo); |
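      /* The scheduling fence sources the fence's destination register, so
       * the EOT message stays ordered after the fence's completion
       * writeback.
       */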
| ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst); |
| progress = true; |
      /* TODO: remove this break if we ever have a shader with multiple EOTs. */
| break; |
| } |
| |
| if (progress) { |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | |
| BRW_DEPENDENCY_VARIABLES); |
| } |
| |
| return progress; |
| } |
| |
| /** |
| * Find the first instruction in the program that might start a region of |
 * divergent control flow due to a HALT jump.  There is no
 * find_halt_control_flow_region_end(); the region of divergence extends
 * until the only SHADER_OPCODE_HALT_TARGET in the program.
| */ |
| static const brw_inst * |
| find_halt_control_flow_region_start(const brw_shader *v) |
| { |
| foreach_block_and_inst(block, brw_inst, inst, v->cfg) { |
| if (inst->opcode == BRW_OPCODE_HALT || |
| inst->opcode == SHADER_OPCODE_HALT_TARGET) |
| return inst; |
| } |
| |
| return NULL; |
| } |
| |
| /** |
 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
 * can cause a BB to be executed with all channels disabled, which leads to
 * the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down.  This may
 * break assumptions of some NoMask SEND messages whose descriptor depends
 * on data generated by live invocations of the shader.
 *
 * The workaround avoids the problem by predicating certain instructions on
 * an ANY horizontal predicate that ensures their execution is omitted when
 * all channels of the program are disabled.
| */ |
| bool |
| brw_workaround_nomask_control_flow(brw_shader &s) |
| { |
| if (s.devinfo->ver != 12) |
| return false; |
| |
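   /* Pick the "any channel enabled" predicate matching the dispatch width:
    * ANY32H for SIMD32, ANY16H for SIMD16 and ANY8H for SIMD8.
    */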
   const brw_predicate pred = s.dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              s.dispatch_width > 8  ? BRW_PREDICATE_ALIGN1_ANY16H :
                                                      BRW_PREDICATE_ALIGN1_ANY8H;
| const brw_inst *halt_start = find_halt_control_flow_region_start(&s); |
| unsigned depth = 0; |
| bool progress = false; |
| |
| const brw_live_variables &live_vars = s.live_analysis.require(); |
| |
| /* Scan the program backwards in order to be able to easily determine |
| * whether the flag register is live at any point. |
| */ |
| foreach_block_reverse_safe(block, s.cfg) { |
| BITSET_WORD flag_liveout = live_vars.block_data[block->num] |
| .flag_liveout[0]; |
| STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1); |
| |
| foreach_inst_in_block_reverse_safe(brw_inst, inst, block) { |
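         /* An unpredicated instruction with exec_size >= 8 overwrites whole
          * bytes of the flag register, so the bits it writes are no longer
          * live above this point.
          */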
| if (!inst->predicate && inst->exec_size >= 8) |
| flag_liveout &= ~inst->flags_written(s.devinfo); |
| |
| switch (inst->opcode) { |
| case BRW_OPCODE_DO: |
| case BRW_OPCODE_IF: |
| /* Note that this doesn't handle BRW_OPCODE_HALT since only |
| * the first one in the program closes the region of divergent |
             * control flow due to any HALT instructions.  Instead, this is
| * handled with the halt_start check below. |
| */ |
| depth--; |
| break; |
| |
| case BRW_OPCODE_WHILE: |
| case BRW_OPCODE_ENDIF: |
| case SHADER_OPCODE_HALT_TARGET: |
| depth++; |
| break; |
| |
| default: |
| /* Note that the vast majority of NoMask SEND instructions in the |
| * program are harmless while executed in a block with all |
| * channels disabled, since any instructions with side effects we |
| * could hit here should be execution-masked. |
| * |
| * The main concern is NoMask SEND instructions where the message |
| * descriptor or header depends on data generated by live |
| * invocations of the shader (RESINFO and |
| * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically |
| * computed surface index seem to be the only examples right now |
| * where this could easily lead to GPU hangs). Unfortunately we |
| * have no straightforward way to detect that currently, so just |
| * predicate any NoMask SEND instructions we find under control |
| * flow. |
| * |
| * If this proves to have a measurable performance impact it can |
| * be easily extended with a whitelist of messages we know we can |
| * safely omit the predication for. |
| */ |
| if (depth && inst->force_writemask_all && |
| is_send(inst) && !inst->predicate && |
| !inst->has_no_mask_send_params) { |
| /* We need to load the execution mask into the flag register by |
| * using a builder with channel group matching the whole shader |
| * (rather than the default which is derived from the original |
| * instruction), in order to avoid getting a right-shifted |
| * value. |
| */ |
| const brw_builder ubld = brw_builder(inst) |
| .exec_all().group(s.dispatch_width, 0); |
| const brw_reg flag = retype(brw_flag_reg(0, 0), |
| BRW_TYPE_UD); |
| |
| /* Due to the lack of flag register allocation we need to save |
| * and restore the flag register if it's live. |
| */ |
| const bool save_flag = flag_liveout & |
| brw_flag_mask(flag, s.dispatch_width / 8); |
| const brw_reg tmp = ubld.group(8, 0).vgrf(flag.type); |
| |
| if (save_flag) { |
| ubld.group(8, 0).UNDEF(tmp); |
| ubld.group(1, 0).MOV(tmp, flag); |
| } |
| |
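               /* Load the mask of live channels into the flag register for
                * the ANY predicate below to test.
                */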
| ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS); |
| |
| set_predicate(pred, inst); |
| inst->flag_subreg = 0; |
| inst->predicate_trivial = true; |
| |
| if (save_flag) |
| ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp); |
| |
| progress = true; |
| } |
| break; |
| } |
| |
| if (inst == halt_start) |
| depth--; |
| |
| flag_liveout |= inst->flags_read(s.devinfo); |
| } |
| } |
| |
| if (progress) |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | |
| BRW_DEPENDENCY_VARIABLES); |
| |
| return progress; |
| } |
| |
| /** |
| * flags_read() and flags_written() return flag access with byte granularity, |
 * but the PRM lists "Access Granularity: Word" for the Flag Register, so we
 * can assume that accessing any part of a word clears its register dependency.
| */ |
| static unsigned |
| bytes_bitmask_to_words(unsigned b) |
| { |
| unsigned first_byte_mask = b & 0x55555555; |
| unsigned second_byte_mask = b & 0xaaaaaaaa; |
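   /* Example: b = 0x4 (only byte 2 accessed) yields 0xc, i.e. both bytes
    * of word 1.
    */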
| return first_byte_mask | |
| (first_byte_mask << 1) | |
| second_byte_mask | |
| (second_byte_mask >> 1); |
| } |
| |
| /** |
| * WaClearArfDependenciesBeforeEot |
| * |
 * Flag register dependencies are not cleared after EOT, so we have to source
 * them before EOT.  We can do this with a simple `mov(1) null:UD, f{0,1}:UD`.
 *
 * To avoid emitting MOVs when they aren't needed, check whether each block
 * reads all the flags it sets.  We might falsely determine a register as
 * unread if it is accessed in a later block, but this should still be good
 * enough.
| */ |
| bool |
| brw_workaround_source_arf_before_eot(brw_shader &s) |
| { |
| bool progress = false; |
| |
| if (s.devinfo->ver != 9) |
| return false; |
| |
| unsigned flags_unread = 0; |
| |
| foreach_block(block, s.cfg) { |
| unsigned flags_unread_in_block = 0; |
| |
| foreach_inst_in_block(brw_inst, inst, block) { |
      /* An instruction can read and write the same flag, so the order of
       * these updates is important.
       */
| flags_unread_in_block &= ~bytes_bitmask_to_words(inst->flags_read(s.devinfo)); |
| flags_unread_in_block |= bytes_bitmask_to_words(inst->flags_written(s.devinfo)); |
| |
      /* HALT does not start a new block even though it can leave a dependency. */
| if (inst->opcode == BRW_OPCODE_HALT || |
| inst->opcode == SHADER_OPCODE_HALT_TARGET) { |
| flags_unread |= flags_unread_in_block; |
| flags_unread_in_block = 0; |
| } |
| } |
| |
| flags_unread |= flags_unread_in_block; |
| |
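      /* Bits 0-3 cover f0 and bits 4-7 cover f1.  Once both flag registers
       * have potentially unread writes, scanning further blocks cannot
       * change the outcome.
       */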
| if ((flags_unread & 0x0f) && (flags_unread & 0xf0)) |
| break; |
| } |
| |
| if (flags_unread) { |
| int eot_count = 0; |
| |
      foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
| if (!inst->eot) |
| continue; |
| |
         /* Currently we always emit only one EOT per program; this WA
          * should be updated if that ever changes.
| */ |
| assert(++eot_count == 1); |
| |
| const brw_builder ubld = brw_builder(inst).uniform(); |
| |
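         /* Source each flag register that may still have an unread write,
          * clearing its dangling dependency before the EOT message.
          */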
| if (flags_unread & 0x0f) |
| ubld.MOV(ubld.null_reg_ud(), retype(brw_flag_reg(0, 0), BRW_TYPE_UD)); |
| |
| if (flags_unread & 0xf0) |
| ubld.MOV(ubld.null_reg_ud(), retype(brw_flag_reg(1, 0), BRW_TYPE_UD)); |
| } |
| |
| progress = true; |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS); |
| } |
| |
| return progress; |
| } |