| /* |
| * Copyright 2022 Alyssa Rosenzweig |
| * Copyright 2021 Valve Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "agx_builder.h" |
| #include "agx_compiler.h" |
| |
| UNUSED static void |
| print_copy(const struct agx_copy *cp) |
| { |
| printf("%sr%u = ", cp->dest_mem ? "m" : "", cp->dest); |
| agx_print_index(cp->src, false, stdout); |
| printf("\n"); |
| } |
| |
| UNUSED static void |
| print_copies(const struct agx_copy *copies, unsigned nr) |
| { |
| printf("[\n"); |
| |
| for (unsigned i = 0; i < nr; ++i) { |
| printf(" "); |
| print_copy(&copies[i]); |
| } |
| |
| printf("]\n"); |
| } |
| |
| /* |
| * Emits code for |
| * |
| * for (int i = 0; i < n; ++i) |
| * registers[dests[i]] = registers[srcs[i]]; |
| * |
| * ...with all copies happening in parallel. |
| * |
| * That is, emit machine instructions equivalent to a parallel copy. This is |
| * used to lower not only parallel copies but also collects and splits, which |
| * also have parallel copy semantics. |
| * |
 * We only handle register-register copies, not general agx_index sources. This
 * suffices for its internal use for register allocation.
| */ |
| |
| static void |
| do_copy(agx_builder *b, const struct agx_copy *copy) |
| { |
| agx_index dst = copy->dest_mem |
| ? agx_memory_register(copy->dest, copy->src.size) |
| : agx_register(copy->dest, copy->src.size); |
| |
| if (copy->dest_mem && copy->src.memory) { |
| /* Memory-memory copies need to be lowered to memory-register and |
| * register-memory, using a reserved scratch register. |
| */ |
| agx_index scratch_reg = agx_register(2, copy->src.size); |
| agx_mov_to(b, scratch_reg, copy->src); |
| agx_mov_to(b, dst, scratch_reg); |
| } else if (copy->src.type == AGX_INDEX_IMMEDIATE) { |
| agx_mov_imm_to(b, dst, copy->src.value); |
| } else { |
| agx_mov_to(b, dst, copy->src); |
| } |
| } |
| |
| static void |
| do_swap(agx_builder *b, const struct agx_copy *copy) |
| { |
| assert(copy->src.type == AGX_INDEX_REGISTER && "only GPRs are swapped"); |
| |
| if (copy->dest == copy->src.value) |
| return; |
| |
| agx_index x = copy->dest_mem |
| ? agx_memory_register(copy->dest, copy->src.size) |
| : agx_register(copy->dest, copy->src.size); |
| agx_index y = copy->src; |
| assert(x.memory == y.memory); |
| |
| /* Memory-memory swaps lowered here, GPR swaps lowered later */ |
| if (x.memory) { |
| agx_index temp1 = agx_register(4, copy->src.size); |
| agx_index temp2 = agx_register(6, copy->src.size); |
| |
| agx_mov_to(b, temp1, x); |
| agx_mov_to(b, temp2, y); |
| agx_mov_to(b, y, temp1); |
| agx_mov_to(b, x, temp2); |
| } else { |
| agx_swap(b, x, y); |
| } |
| } |
| |
/* Bookkeeping state for sequentializing one batch of parallel copies. */
struct copy_ctx {
   /* Number of copies being processed */
   unsigned entry_count;

   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[AGX_NUM_MODELED_REGS];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];

   /* The copies being lowered. Sized by register count rather than the
    * incoming copy count because split_32bit_copy may append new entries.
    */
   struct agx_copy entries[AGX_NUM_MODELED_REGS];
};
| |
| static bool |
| entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx) |
| { |
| for (unsigned i = 0; i < agx_size_align_16(entry->src.size); i++) { |
| if (ctx->physreg_use_count[entry->dest + i] != 0) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static bool |
| is_real(struct agx_copy *entry) |
| { |
| return entry->src.type == AGX_INDEX_REGISTER && |
| entry->dest_mem == entry->src.memory; |
| } |
| |
/* TODO: Generalize to other bit sizes */
/* Split a pending 32-bit register copy into two 16-bit halves: `entry` is
 * narrowed in place to the low half, and a fresh entry for the high half is
 * appended to the context. Splitting can unblock forward progress when only
 * one half of the destination is still in use as a source.
 */
static void
split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
{
   assert(!entry->done);
   assert(is_real(entry));
   assert(agx_size_align_16(entry->src.size) == 2);
   struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];

   /* High half: one 16-bit slot above the original dest/src. */
   new_entry->dest = entry->dest + 1;
   new_entry->dest_mem = entry->dest_mem;
   new_entry->src = entry->src;
   new_entry->src.value += 1;
   new_entry->done = false;
   entry->src.size = AGX_SIZE_16;
   new_entry->src.size = AGX_SIZE_16;
   /* The new entry takes over as the pending writer of the upper slot. */
   ctx->physreg_dest[entry->dest + 1] = new_entry;
}
| |
/*
 * Lower the subset of parallel copies belonging to one register class (cls is
 * false for GPRs, true for memory registers), emitting moves/swaps via the
 * builder. Copies of the other class are skipped and must be handled by a
 * separate call.
 */
static void
agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
                                   unsigned num_copies, bool cls)
{
   /* First, lower away 64-bit copies to smaller chunks, since we don't have
    * 64-bit ALU so we always want to split.
    */
   /* NOTE(review): calloc result is not checked; an OOM here would crash.
    * Worst case is 2 entries per incoming copy (64-bit split), hence the
    * num_copies * 2 sizing.
    */
   struct agx_copy *copies2 = calloc(sizeof(copies[0]), num_copies * 2);
   unsigned num_copies2 = 0;

   for (unsigned i = 0; i < num_copies; ++i) {
      struct agx_copy copy = copies[i];

      /* Filter by class */
      if (copy.dest_mem != cls)
         continue;

      assert(copy.dest < AGX_NUM_MODELED_REGS);

      if (copy.src.size == AGX_SIZE_64) {
         /* Emit the low 32-bit half first... */
         copy.src.size = AGX_SIZE_32;
         copies2[num_copies2++] = copy;

         /* ...then derive the high half from it. */
         if (copy.src.type == AGX_INDEX_IMMEDIATE) {
            /* Immediates hold at most 32 bits, so the high half is zero. */
            static_assert(sizeof(copy.src.value) * 8 == 32, "known size");
            copy.src.value = 0;
         } else {
            assert(copy.src.type == AGX_INDEX_REGISTER ||
                   copy.src.type == AGX_INDEX_UNIFORM);

            /* Registers are indexed in 16-bit units, so +2 skips 32 bits. */
            copy.src.value += 2;
         }

         copy.dest += 2;
         copies2[num_copies2++] = copy;
      } else {
         copies2[num_copies2++] = copy;
      }
   }

   copies = copies2;
   num_copies = num_copies2;

   /* Set up the bookkeeping */
   struct copy_ctx _ctx = {.entry_count = num_copies};
   struct copy_ctx *ctx = &_ctx;

   memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &copies[i];

      ctx->entries[i] = *entry;

      for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
         /* Only same-class register sources participate in blocking. */
         if (is_real(entry))
            ctx->physreg_use_count[entry->src.value + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dest[entry->dest + j]);
         ctx->physreg_dest[entry->dest + j] = &ctx->entries[i];
      }
   }

   /* Try to vectorize aligned 16-bit copies to use 32-bit operations instead */
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->src.size != AGX_SIZE_16)
         continue;

      /* Both dest and src must be 32-bit aligned to merge. */
      if ((entry->dest & 1) || (entry->src.value & 1))
         continue;

      if (entry->src.type != AGX_INDEX_UNIFORM &&
          entry->src.type != AGX_INDEX_REGISTER)
         continue;

      unsigned next_dest = entry->dest + 1;
      assert(next_dest < ARRAY_SIZE(ctx->physreg_dest) && "aligned reg");

      struct agx_copy *next_copy = ctx->physreg_dest[next_dest];
      if (!next_copy)
         continue;

      assert(next_copy->dest == next_dest && "data structure invariant");
      assert(next_copy->src.size == AGX_SIZE_16 && "unaligned copy");

      if (next_copy->src.type != entry->src.type)
         continue;

      /* The sources must be contiguous as well as the destinations. */
      if (next_copy->src.value != (entry->src.value + 1))
         continue;

      /* Vectorize the copies: widen this entry to 32-bit and retire the
       * now-redundant neighbour.
       */
      ctx->physreg_dest[next_dest] = entry;
      entry->src.size = AGX_SIZE_32;
      next_copy->done = true;
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destination aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dest to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(b, entry);
            /* The source is consumed and the destination written, so drop
             * this entry from the blocking bookkeeping.
             */
            for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
               if (is_real(entry))
                  ctx->physreg_use_count[entry->src.value + j]--;
               ctx->physreg_dest[entry->dest + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (entry->done || (agx_size_align_16(entry->src.size) != 2))
            continue;

         /* Split only when exactly one half is free, and only real
          * register-register copies (splitting others unblocks nothing).
          */
         if (((ctx->physreg_use_count[entry->dest] == 0 ||
               ctx->physreg_use_count[entry->dest + 1] == 0)) &&
             is_real(entry)) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *          ^             |
    *          |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *  ^                     |
    *  |---------------------|
    *
    * and this must be only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    *  n_1 -> ... -> n_i
    *  ^              |
    *  |--------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(is_real(entry));

      /* catch trivial copies */
      if (entry->dest == entry->src.value) {
         entry->done = true;
         continue;
      }

      do_swap(b, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (agx_size_align_16(entry->src.size) == 1) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct agx_copy *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            /* A 32-bit source straddling our 16-bit destination must be
             * split so each half can be redirected independently.
             */
            if (blocking->src.value <= entry->dest &&
                blocking->src.value + 1 >= entry->dest &&
                agx_size_align_16(blocking->src.size) == 2) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct agx_copy *blocking = &ctx->entries[j];
         if (blocking->src.value >= entry->dest &&
             blocking->src.value <
                entry->dest + agx_size_align_16(entry->src.size)) {
            /* The swap moved this value to entry->src; re-point the reader. */
            blocking->src.value =
               entry->src.value + (blocking->src.value - entry->dest);
         }
      }

      entry->done = true;
   }

   free(copies2);
}
| |
/*
 * Public entry point: lower a set of parallel copies to machine code. See the
 * comment block above for the parallel copy semantics.
 */
void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
                         unsigned num_copies)
{
   /* Emit copies for each register class separately because we don't have
    * register class awareness in the parallel copy lowering data structure.
    */
   agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
   agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
}