src/freedreno/ir3/ir3_delay.c - third_party/mesa - Git at Google

 /*
  * Copyright (C) 2019 Google, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
  * Authors:
  *    Rob Clark <robclark@freedesktop.org>
  */

 #include "ir3.h"

 /* The maximum number of nop's we may need to insert between two instructions.
  */
 #define MAX_NOPS 6

 /*
  * Helpers to figure out the necessary delay slots between instructions.  Used
  * both in scheduling pass(es) and the final pass to insert any required nop's
  * so that the shader program is valid.
  *
  * Note that this needs to work both pre and post RA, so we can't assume ssa
  * src iterators work.
  */

 /* calculate required # of delay slots between the instruction that
  * assigns a value and the one that consumes
  */
 int
 ir3_delayslots(struct ir3_instruction *assigner,
                struct ir3_instruction *consumer, unsigned n, bool soft)
 {
    /* generally don't count false dependencies, since this can just be
     * something like a barrier, or SSBO store.
     */
    if (__is_false_dep(consumer, n))
       return 0;

    /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
     * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
     * handled with sync bits
     */

    if (is_meta(assigner) || is_meta(consumer))
       return 0;

    if (writes_addr0(assigner) || writes_addr1(assigner))
       return 6;

    if (soft && is_ss_producer(assigner))
       return soft_ss_delay(assigner);

    /* handled via sync flags: */
    if (is_ss_producer(assigner) || is_sy_producer(assigner))
       return 0;

    /* As far as we know, shader outputs don't need any delay. */
    if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
       return 0;

    /* assigner must be alu: */
    if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
        is_mem(consumer)) {
       return 6;
    } else {
       /* In mergedregs mode, there is an extra 2-cycle penalty when half of
        * a full-reg is read as a half-reg or when a half-reg is read as a
        * full-reg.
        */
       bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
                              (consumer->srcs[n]->flags & IR3_REG_HALF);
       unsigned penalty = mismatched_half ? 3 : 0;
       if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
          /* special case, 3rd src to cat3 not required on first cycle */
          return 1 + penalty;
       } else {
          return 3 + penalty;
       }
    }
 }

 static bool
 count_instruction(struct ir3_instruction *n)
 {
    /* NOTE: don't count branch/jump since we don't know yet if they will
     * be eliminated later in resolve_jumps().. really should do that
     * earlier so we don't have this constraint.
     */
    return is_alu(n) ||
           (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
 }

 /* Post-RA, we don't have arrays any more, so we have to be a bit careful here
  * and have to handle relative accesses specially.
  */

 static unsigned
 post_ra_reg_elems(struct ir3_register *reg)
 {
    if (reg->flags & IR3_REG_RELATIV)
       return reg->size;
    return reg_elems(reg);
 }

 static unsigned
 post_ra_reg_num(struct ir3_register *reg)
 {
    if (reg->flags & IR3_REG_RELATIV)
       return reg->array.base;
    return reg->num;
 }

 unsigned
 ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
                            struct ir3_instruction *consumer,
                            unsigned assigner_n, unsigned consumer_n)
 {
    unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false);

    struct ir3_register *src = consumer->srcs[consumer_n];
    struct ir3_register *dst = assigner->dsts[assigner_n];

    if (assigner->repeat == 0 && consumer->repeat == 0)
       return delay;

    unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
    unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);

    /* If either side is a relative access, we can't really apply most of the
     * reasoning below because we don't know which component aliases which.
     * Just bail in this case.
     */
    if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
       return delay;

    /* MOVMSK seems to require that all users wait until the entire
     * instruction is finished, so just bail here.
     */
    if (assigner->opc == OPC_MOVMSK)
       return delay;

    bool mismatched_half =
       (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);

    /* TODO: Handle the combination of (rpt) and different component sizes
     * better like below. This complicates things significantly because the
     * components don't line up.
     */
    if (mismatched_half)
       return delay;

    /* If an instruction has a (rpt), then it acts as a sequence of
     * instructions, reading its non-(r) sources at each cycle. First, get the
     * register num for the first instruction where they interfere:
     */

    unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);

    /* Now, for that first conflicting half/full register, figure out the
     * sub-instruction within assigner/consumer it corresponds to. For (r)
     * sources, this should already return the correct answer of 0. However we
     * have to special-case the multi-mov instructions, where the
     * sub-instructions sometimes come from the src/dst indices instead.
     */
    unsigned first_src_instr;
    if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
       first_src_instr = consumer_n;
    else
       first_src_instr = first_num - src->num;

    unsigned first_dst_instr;
    if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
       first_dst_instr = assigner_n;
    else
       first_dst_instr = first_num - dst->num;

    /* The delay we return is relative to the *end* of assigner and the
     * *beginning* of consumer, because it's the number of nops (or other
     * things) needed between them. Any instructions after first_dst_instr
     * subtract from the delay, and so do any instructions before
     * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
     * delay to account for that.
     *
     * Now, a priori, we need to go through this process for every
     * conflicting regnum and take the minimum of the offsets to make sure
     * that the appropriate number of nop's is inserted for every conflicting
     * pair of sub-instructions. However, as we go to the next conflicting
     * regnum (if any), the number of instructions after first_dst_instr
     * decreases by 1 and the number of source instructions before
     * first_src_instr correspondingly increases by 1, so the offset stays the
     * same for all conflicting registers.
     */
    unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
    return offset > delay ? 0 : delay - offset;
 }

 static unsigned
 delay_calc_srcn(struct ir3_instruction *assigner,
                 struct ir3_instruction *consumer, unsigned assigner_n,
                 unsigned consumer_n, bool mergedregs)
 {
    struct ir3_register *src = consumer->srcs[consumer_n];
    struct ir3_register *dst = assigner->dsts[assigner_n];
    bool mismatched_half =
       (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);

    /* In the mergedregs case or when the register is a special register,
     * half-registers do not alias with full registers.
     */
    if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
        mismatched_half)
       return 0;

    unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
    unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
    unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
    unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);

    if (dst_start >= src_end || src_start >= dst_end)
       return 0;

    return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
 }

 static unsigned
 delay_calc(struct ir3_block *block, struct ir3_instruction *start,
            struct ir3_instruction *consumer, unsigned distance,
            regmask_t *in_mask, bool mergedregs)
 {
    regmask_t mask;
    memcpy(&mask, in_mask, sizeof(mask));

    unsigned delay = 0;
    /* Search backwards starting at the instruction before start, unless it's
     * NULL then search backwards from the block end.
     */
    struct list_head *start_list =
       start ? start->node.prev : block->instr_list.prev;
    list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
                                  &block->instr_list, node) {
       if (count_instruction(assigner))
          distance += assigner->nop;

       if (distance + delay >= MAX_NOPS)
          return delay;

       if (is_meta(assigner))
          continue;

       unsigned new_delay = 0;

       foreach_dst_n (dst, dst_n, assigner) {
          if (dst->wrmask == 0)
             continue;
          if (!regmask_get(&mask, dst))
             continue;
          foreach_src_n (src, src_n, consumer) {
             if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
                continue;

             unsigned src_delay = delay_calc_srcn(
                assigner, consumer, dst_n, src_n, mergedregs);
             new_delay = MAX2(new_delay, src_delay);
          }
          regmask_clear(&mask, dst);
       }

       new_delay = new_delay > distance ? new_delay - distance : 0;
       delay = MAX2(delay, new_delay);

       if (count_instruction(assigner))
          distance += 1 + assigner->repeat;
    }

    /* Note: this allows recursion into "block" if it has already been
     * visited, but *not* recursion into its predecessors. We may have to
     * visit the original block twice, for the loop case where we have to
     * consider definititons in an earlier iterations of the same loop:
     *
     * while (...) {
     *		mov.u32u32 ..., r0.x
     *		...
     *		mov.u32u32 r0.x, ...
     * }
     *
     * However any other recursion would be unnecessary.
     */

    if (block->data != block) {
       block->data = block;

       for (unsigned i = 0; i < block->predecessors_count; i++) {
          struct ir3_block *pred = block->predecessors[i];
          unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
                                           &mask, mergedregs);
          delay = MAX2(delay, pred_delay);
       }

       block->data = NULL;
    }

    return delay;
 }

 /**
  * Calculate delay for nop insertion. This must exactly match hardware
  * requirements, including recursing into predecessor blocks.
  */
 unsigned
 ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
                bool mergedregs)
 {
    regmask_t mask;
    regmask_init(&mask, mergedregs);
    foreach_src (src, instr) {
       if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
          regmask_set(&mask, src);
    }

    return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
 }
	/*
	* Copyright (C) 2019 Google, Inc.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*
	* Authors:
	* Rob Clark <robclark@freedesktop.org>
	*/

	#include "ir3.h"

	/* The maximum number of nop's we may need to insert between two instructions.
	*/
	#define MAX_NOPS 6

	/*
	* Helpers to figure out the necessary delay slots between instructions. Used
	* both in scheduling pass(es) and the final pass to insert any required nop's
	* so that the shader program is valid.
	*
	* Note that this needs to work both pre and post RA, so we can't assume ssa
	* src iterators work.
	*/

	/* calculate required # of delay slots between the instruction that
	* assigns a value and the one that consumes
	*/
	int
	ir3_delayslots(struct ir3_instruction *assigner,
	struct ir3_instruction *consumer, unsigned n, bool soft)
	{
	/* generally don't count false dependencies, since this can just be
	* something like a barrier, or SSBO store.
	*/
	if (__is_false_dep(consumer, n))
	return 0;

	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
	* alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
	* handled with sync bits
	*/

	if (is_meta(assigner) \|\| is_meta(consumer))
	return 0;

	if (writes_addr0(assigner) \|\| writes_addr1(assigner))
	return 6;

	if (soft && is_ss_producer(assigner))
	return soft_ss_delay(assigner);

	/* handled via sync flags: */
	if (is_ss_producer(assigner) \|\| is_sy_producer(assigner))
	return 0;

	/* As far as we know, shader outputs don't need any delay. */
	if (consumer->opc == OPC_END \|\| consumer->opc == OPC_CHMASK)
	return 0;

	/* assigner must be alu: */
	if (is_flow(consumer) \|\| is_sfu(consumer) \|\| is_tex(consumer) \|\|
	is_mem(consumer)) {
	return 6;
	} else {
	/* In mergedregs mode, there is an extra 2-cycle penalty when half of
	* a full-reg is read as a half-reg or when a half-reg is read as a
	* full-reg.
	*/
	bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
	(consumer->srcs[n]->flags & IR3_REG_HALF);
	unsigned penalty = mismatched_half ? 3 : 0;
	if ((is_mad(consumer->opc) \|\| is_madsh(consumer->opc)) && (n == 2)) {
	/* special case, 3rd src to cat3 not required on first cycle */
	return 1 + penalty;
	} else {
	return 3 + penalty;
	}
	}
	}

	static bool
	count_instruction(struct ir3_instruction *n)
	{
	/* NOTE: don't count branch/jump since we don't know yet if they will
	* be eliminated later in resolve_jumps().. really should do that
	* earlier so we don't have this constraint.
	*/
	return is_alu(n) \|\|
	(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
	}

	/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
	* and have to handle relative accesses specially.
	*/

	static unsigned
	post_ra_reg_elems(struct ir3_register *reg)
	{
	if (reg->flags & IR3_REG_RELATIV)
	return reg->size;
	return reg_elems(reg);
	}

	static unsigned
	post_ra_reg_num(struct ir3_register *reg)
	{
	if (reg->flags & IR3_REG_RELATIV)
	return reg->array.base;
	return reg->num;
	}

	unsigned
	ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
	struct ir3_instruction *consumer,
	unsigned assigner_n, unsigned consumer_n)
	{
	unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false);

	struct ir3_register *src = consumer->srcs[consumer_n];
	struct ir3_register *dst = assigner->dsts[assigner_n];

	if (assigner->repeat == 0 && consumer->repeat == 0)
	return delay;

	unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
	unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);

	/* If either side is a relative access, we can't really apply most of the
	* reasoning below because we don't know which component aliases which.
	* Just bail in this case.
	*/
	if ((src->flags & IR3_REG_RELATIV) \|\| (dst->flags & IR3_REG_RELATIV))
	return delay;

	/* MOVMSK seems to require that all users wait until the entire
	* instruction is finished, so just bail here.
	*/
	if (assigner->opc == OPC_MOVMSK)
	return delay;

	bool mismatched_half =
	(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);

	/* TODO: Handle the combination of (rpt) and different component sizes
	* better like below. This complicates things significantly because the
	* components don't line up.
	*/
	if (mismatched_half)
	return delay;

	/* If an instruction has a (rpt), then it acts as a sequence of
	* instructions, reading its non-(r) sources at each cycle. First, get the
	* register num for the first instruction where they interfere:
	*/

	unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);

	/* Now, for that first conflicting half/full register, figure out the
	* sub-instruction within assigner/consumer it corresponds to. For (r)
	* sources, this should already return the correct answer of 0. However we
	* have to special-case the multi-mov instructions, where the
	* sub-instructions sometimes come from the src/dst indices instead.
	*/
	unsigned first_src_instr;
	if (consumer->opc == OPC_SWZ \|\| consumer->opc == OPC_GAT)
	first_src_instr = consumer_n;
	else
	first_src_instr = first_num - src->num;

	unsigned first_dst_instr;
	if (assigner->opc == OPC_SWZ \|\| assigner->opc == OPC_SCT)
	first_dst_instr = assigner_n;
	else
	first_dst_instr = first_num - dst->num;

	/* The delay we return is relative to the end of assigner and the
	* beginning of consumer, because it's the number of nops (or other
	* things) needed between them. Any instructions after first_dst_instr
	* subtract from the delay, and so do any instructions before
	* first_src_instr. Calculate an offset to subtract from the non-rpt-aware
	* delay to account for that.
	*
	* Now, a priori, we need to go through this process for every
	* conflicting regnum and take the minimum of the offsets to make sure
	* that the appropriate number of nop's is inserted for every conflicting
	* pair of sub-instructions. However, as we go to the next conflicting
	* regnum (if any), the number of instructions after first_dst_instr
	* decreases by 1 and the number of source instructions before
	* first_src_instr correspondingly increases by 1, so the offset stays the
	* same for all conflicting registers.
	*/
	unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
	return offset > delay ? 0 : delay - offset;
	}

	static unsigned
	delay_calc_srcn(struct ir3_instruction *assigner,
	struct ir3_instruction *consumer, unsigned assigner_n,
	unsigned consumer_n, bool mergedregs)
	{
	struct ir3_register *src = consumer->srcs[consumer_n];
	struct ir3_register *dst = assigner->dsts[assigner_n];
	bool mismatched_half =
	(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);

	/* In the mergedregs case or when the register is a special register,
	* half-registers do not alias with full registers.
	*/
	if ((!mergedregs \|\| is_reg_special(src) \|\| is_reg_special(dst)) &&
	mismatched_half)
	return 0;

	unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
	unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
	unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
	unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);

	if (dst_start >= src_end \|\| src_start >= dst_end)
	return 0;

	return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
	}

	static unsigned
	delay_calc(struct ir3_block block, struct ir3_instruction start,
	struct ir3_instruction *consumer, unsigned distance,
	regmask_t *in_mask, bool mergedregs)
	{
	regmask_t mask;
	memcpy(&mask, in_mask, sizeof(mask));

	unsigned delay = 0;
	/* Search backwards starting at the instruction before start, unless it's
	* NULL then search backwards from the block end.
	*/
	struct list_head *start_list =
	start ? start->node.prev : block->instr_list.prev;
	list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
	&block->instr_list, node) {
	if (count_instruction(assigner))
	distance += assigner->nop;

	if (distance + delay >= MAX_NOPS)
	return delay;

	if (is_meta(assigner))
	continue;

	unsigned new_delay = 0;

	foreach_dst_n (dst, dst_n, assigner) {
	if (dst->wrmask == 0)
	continue;
	if (!regmask_get(&mask, dst))
	continue;
	foreach_src_n (src, src_n, consumer) {
	if (src->flags & (IR3_REG_IMMED \| IR3_REG_CONST))
	continue;

	unsigned src_delay = delay_calc_srcn(
	assigner, consumer, dst_n, src_n, mergedregs);
	new_delay = MAX2(new_delay, src_delay);
	}
	regmask_clear(&mask, dst);
	}

	new_delay = new_delay > distance ? new_delay - distance : 0;
	delay = MAX2(delay, new_delay);

	if (count_instruction(assigner))
	distance += 1 + assigner->repeat;
	}

	/* Note: this allows recursion into "block" if it has already been
	* visited, but not recursion into its predecessors. We may have to
	* visit the original block twice, for the loop case where we have to
	* consider definititons in an earlier iterations of the same loop:
	*
	* while (...) {
	* mov.u32u32 ..., r0.x
	* ...
	* mov.u32u32 r0.x, ...
	* }
	*
	* However any other recursion would be unnecessary.
	*/

	if (block->data != block) {
	block->data = block;

	for (unsigned i = 0; i < block->predecessors_count; i++) {
	struct ir3_block *pred = block->predecessors[i];
	unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
	&mask, mergedregs);
	delay = MAX2(delay, pred_delay);
	}

	block->data = NULL;
	}

	return delay;
	}

	/**
	* Calculate delay for nop insertion. This must exactly match hardware
	* requirements, including recursing into predecessor blocks.
	*/
	unsigned
	ir3_delay_calc(struct ir3_block block, struct ir3_instruction instr,
	bool mergedregs)
	{
	regmask_t mask;
	regmask_init(&mask, mergedregs);
	foreach_src (src, instr) {
	if (!(src->flags & (IR3_REG_IMMED \| IR3_REG_CONST)))
	regmask_set(&mask, src);
	}

	return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
	}