| /* |
| * Copyright 2023 Asahi Lina |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "agx_scratch.h" |
| #include "asahi/compiler/agx_compile.h" |
| #include "shaders/helper.h" |
| #include "util/u_hexdump.h" |
| #include "agx_bo.h" |
| #include "libagx_shaders.h" |
| #include "nir.h" |
| #include "nir_builder_opcodes.h" |
| |
| #define AGX_ADDR_SHIFT 8 |
| #define AGX_THREADS_PER_GROUP 32 |
| #define AGX_SPILL_UNIT_DWORDS 8 |
| |
| // FIXME: What is the actual value here? Seems to be 96 + 8 or so? |
| #define AGX_MAX_SUBGROUPS_PER_CORE 128 |
| |
| // Unknown if this goes higher. |
| #define AGX_MAX_SCRATCH_BLOCK_LOG4 6 |
| #define AGX_MAX_SCRATCH_DWORDS \ |
| ((AGX_SPILL_UNIT_DWORDS << (2 * AGX_MAX_SCRATCH_BLOCK_LOG4)) * 4) |
| |
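/*
 * A spill allocation size, encoded as `count` blocks (1-4) of
 * AGX_SPILL_UNIT_DWORDS << (2 * log4_bsize) dwords each.
 */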
| struct spill_size { |
| uint32_t log4_bsize; |
| uint32_t count; |
| }; |
| |
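/*
 * Upload the precompiled helper program into an executable, read-only,
 * low-VA BO. The helper runs on the GPU to hand out scratch blocks to
 * spilling subgroups, using the header and blocklists laid out in
 * agx_scratch_realloc() below.
 */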
| struct agx_bo * |
| agx_build_helper(struct agx_device *dev) |
| { |
| struct agx_bo *bo = agx_bo_create( |
| dev, sizeof(libagx_g13_helper), 0, |
| AGX_BO_READONLY | AGX_BO_EXEC | AGX_BO_LOW_VA, "Helper shader"); |
| assert(bo); |
| memcpy(bo->map, libagx_g13_helper, sizeof(libagx_g13_helper)); |
| |
| if (dev->debug & AGX_DBG_SCRATCH) |
| fprintf(stderr, "Helper: 0x%" PRIx64 "\n", bo->va->addr); |
| |
| return bo; |
| } |
| |
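/*
 * Round a scratch size in dwords up to the block-based encoding: blocks are
 * power-of-4 multiples of AGX_SPILL_UNIT_DWORDS, and an allocation uses 1-3
 * of them (a 4-block result is normalized to one block of the next log4
 * unit, except at the maximum size, which keeps 4 blocks).
 *
 * Worked example: dwords = 100 gives ceil(100 / 8) = 13 spill units,
 * log4 = log2(13) / 2 = 1, hence 32-dword blocks; ceil(100 / 32) = 4
 * blocks, normalized to {log4_bsize = 2, count = 1}, i.e. one 128-dword
 * block.
 */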
| static struct spill_size |
| agx_scratch_get_spill_size(unsigned dwords) |
| { |
| if (!dwords) { |
| return (struct spill_size){0, 0}; |
| } |
| assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large"); |
| |
| unsigned log4 = |
| util_logbase2(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)) / 2; |
| unsigned blocks = DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS << (2 * log4)); |
| if (log4 > AGX_MAX_SCRATCH_BLOCK_LOG4) { |
| // Max size case (4 blocks) |
| assert(log4 == (AGX_MAX_SCRATCH_BLOCK_LOG4 + 1)); |
| log4--; |
| blocks = 4; |
| } else if (blocks == 4) { |
      // Non-max-size 4-block case: shift to the next log4 unit for
      // consistency.
| log4++; |
| blocks = 1; |
| } |
| |
| return (struct spill_size){log4, blocks}; |
| } |
| |
| unsigned |
| agx_scratch_get_bucket(uint32_t dwords) |
| { |
| /* For debugging/analysis purposes, scratch allocation sizes are |
| * divided into buckets. Since we only allocate a single global |
| * worst-case scratch buffer, these buckets do not have any meaning |
| * for the actual allocation mechanism. They are only used to log |
| * allocation sizes. We just use a simple log2 of the size here. |
| */ |
| |
| if (!dwords) |
| return 0; |
| assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large"); |
| |
| return MIN2( |
| AGX_SPILL_SIZE_BUCKETS - 1, |
| 1 + util_logbase2_ceil(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS))); |
| } |
| |
| static void |
| agx_scratch_realloc(struct agx_scratch *scratch) |
| { |
| if (scratch->buf) |
| agx_bo_unreference(scratch->dev, scratch->buf); |
| |
| struct spill_size size = agx_scratch_get_spill_size(scratch->size_dwords); |
| |
| if (scratch->dev->debug & AGX_DBG_SCRATCH) |
| fprintf(stderr, "Scratch realloc: %d (%d:%d) x %d\n", |
| scratch->size_dwords, size.log4_bsize, size.count, |
| scratch->subgroups); |
| |
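   /* A block holds block_dwords of scratch for each of the 32 threads in a
    * subgroup (4 bytes per dword). size_dwords is rounded up to the
    * capacity actually allocated. */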
| unsigned block_dwords = AGX_SPILL_UNIT_DWORDS << (2 * size.log4_bsize); |
| size_t block_size_bytes = (AGX_THREADS_PER_GROUP * 4) * block_dwords; |
| scratch->size_dwords = block_dwords * size.count; |
| |
| if (scratch->dev->debug & AGX_DBG_SCRATCH) |
| fprintf(stderr, "Block size: 0x%zx bytes (%d)\n", block_size_bytes, |
| size.log4_bsize); |
| |
| unsigned block_count = size.count; |
| |
| if (scratch->dev->debug & AGX_DBG_SCRATCH) |
| fprintf(stderr, "Block count: %d\n", block_count); |
| |
| size_t core_alloc = block_size_bytes * block_count * scratch->subgroups; |
| |
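   /*
    * BO layout: the agx_helper_header, then a blocklist of
    * agx_helper_block entries (one per subgroup) for each core, then the
    * scratch blocks themselves, with the block area aligned to the block
    * size.
    */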
| size_t header_size = sizeof(struct agx_helper_header); |
| |
| size_t blocklist_off = header_size; |
| size_t blocklist_core_size = |
| scratch->subgroups * sizeof(struct agx_helper_block); |
| size_t blocklist_size = blocklist_core_size * scratch->num_cores; |
| |
| size_t blocks_off = align(header_size + blocklist_size, block_size_bytes); |
| size_t total_alloc = blocks_off + core_alloc * scratch->num_cores; |
| |
| unsigned flags = 0; |
| #ifdef SCRATCH_DEBUG |
| flags = AGX_BO_WRITEBACK; |
| #endif |
   scratch->buf = agx_bo_create(scratch->dev, total_alloc, block_size_bytes,
                                flags, "Scratch");
   assert(scratch->buf);
   memset(scratch->buf->map, 0, blocks_off);
| |
| struct agx_helper_header *hdr = scratch->buf->map; |
| scratch->header = hdr; |
| |
| uint64_t blocklist_gpu = scratch->buf->va->addr + blocklist_off; |
| struct agx_helper_block *blocklist_cpu = scratch->buf->map + blocklist_off; |
| |
| #ifdef SCRATCH_DEBUG |
| scratch->blocklist = blocklist_cpu; |
| scratch->data = scratch->buf->map + blocks_off; |
| scratch->core_size = block_size_bytes * block_count * scratch->subgroups; |
| #endif |
| |
| uint64_t blocks_gpu = scratch->buf->va->addr + blocks_off; |
| |
| hdr->subgroups = scratch->subgroups; |
| |
| unsigned num_cores = 0; |
| unsigned core_id; |
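   /*
    * Iterate global core IDs, assumed to be numbered
    * cluster * util_next_power_of_two(num_cores_per_cluster) + core.
    * Cores missing from a cluster's core mask are skipped; their
    * hdr->cores[] entries stay zeroed from the memset above.
    */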
| for (core_id = 0; core_id < AGX_MAX_CORE_ID; core_id++) { |
| #ifndef SCRATCH_DEBUG_CORES |
| unsigned cores_per_cluster = |
| util_next_power_of_two(scratch->dev->params.num_cores_per_cluster); |
| unsigned cluster = core_id / cores_per_cluster; |
| unsigned core = core_id % cores_per_cluster; |
| if (cluster >= scratch->dev->params.num_clusters_total) |
| break; |
| if (core >= scratch->dev->params.num_cores_per_cluster || |
| !(scratch->dev->params.core_masks[cluster] & BITFIELD_BIT(core))) |
| continue; |
| #endif |
| num_cores++; |
| #ifdef SCRATCH_DEBUG |
| scratch->core_present[core_id] = true; |
| #endif |
| |
| hdr->cores[core_id].blocklist = blocklist_gpu; |
| |
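      /*
       * Each 32-bit block descriptor packs the block address (shifted by
       * AGX_ADDR_SHIFT) with a mask in the low bits that the block
       * alignment guarantees are zero: the first block's mask encodes its
       * log4 size, continuation blocks are marked with 1, and unused slots
       * are 0. This encoding is inferred from the packing below, not from
       * documentation.
       */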
| for (unsigned sg = 0; sg < scratch->subgroups; sg++) { |
| uint32_t mask = BITFIELD_MASK(size.log4_bsize + 1); |
| assert(!(blocks_gpu & (block_size_bytes - 1))); |
| |
| uint32_t base = blocks_gpu >> AGX_ADDR_SHIFT; |
| uint32_t stride = block_size_bytes >> AGX_ADDR_SHIFT; |
| blocklist_cpu[sg].blocks[0] = mask | base; |
| for (int block = 1; block <= 3; block++) { |
| if (block_count >= (block + 1)) |
| blocklist_cpu[sg].blocks[block] = 1 | (base + block * stride); |
| else |
| blocklist_cpu[sg].blocks[block] = 0; |
| } |
| |
| blocks_gpu += block_size_bytes * block_count; |
| } |
| |
| blocklist_gpu += sizeof(struct agx_helper_block) * scratch->subgroups; |
| blocklist_cpu += scratch->subgroups; |
| } |
| scratch->max_core_id = core_id; |
| assert(num_cores == scratch->num_cores); |
| |
| if (scratch->dev->debug & AGX_DBG_SCRATCH) |
| fprintf(stderr, "New Scratch @ 0x%" PRIx64 " (size: 0x%zx)\n", |
| scratch->buf->va->addr, scratch->buf->size); |
| } |
| |
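/*
 * Ensure the global scratch buffer covers an allocation of `dwords` per
 * thread for up to `subgroups` subgroups on every core. The buffer only
 * ever grows; passing 0 subgroups requests the per-core maximum.
 */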
| void |
| agx_scratch_alloc(struct agx_scratch *scratch, unsigned dwords, |
| size_t subgroups) |
| { |
| bool realloc = false; |
| |
| if (!dwords) |
| return; |
| |
| assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large"); |
| |
| if (!subgroups) |
| subgroups = AGX_MAX_SUBGROUPS_PER_CORE; |
| |
| subgroups = MIN2(AGX_MAX_SUBGROUPS_PER_CORE, subgroups); |
| |
| if (dwords > scratch->size_dwords) { |
| scratch->size_dwords = dwords; |
| realloc = true; |
| } |
| |
| if (subgroups > scratch->subgroups) { |
| scratch->subgroups = subgroups; |
| realloc = true; |
| } |
| |
| if (realloc) { |
| agx_scratch_realloc(scratch); |
| } |
| } |
| |
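/*
 * Reset the per-core allocation statistics maintained by the helper before
 * running a workload. All scratch from previous work must already have
 * been returned (alloc_cur == 0).
 */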
| void |
| agx_scratch_debug_pre(struct agx_scratch *scratch) |
| { |
| if (!scratch->buf) |
| return; |
| |
| for (int core = 0; core < scratch->max_core_id; core++) { |
| assert(!scratch->header->cores[core].alloc_cur); |
| scratch->header->cores[core].alloc_max = 0; |
| scratch->header->cores[core].alloc_failed = 0; |
| memset(scratch->header->cores[core].alloc_count, 0, |
| sizeof(scratch->header->cores[core].alloc_count)); |
| } |
| } |
| |
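/*
 * Dump the per-core allocation statistics after a workload and check that
 * every core returned all of its scratch and that no allocation failed.
 */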
| void |
| agx_scratch_debug_post(struct agx_scratch *scratch) |
| { |
| if (!scratch->buf) |
| return; |
| |
| fprintf(stderr, "Scratch @ 0x%" PRIx64 "\n", scratch->buf->va->addr); |
| |
| for (int core = 0; core < scratch->max_core_id; core++) { |
| fprintf(stderr, "Core %3d: max %d, failed %d, counts:", core, |
| scratch->header->cores[core].alloc_max, |
| scratch->header->cores[core].alloc_failed); |
| |
| for (unsigned bucket = 0; bucket < AGX_SPILL_SIZE_BUCKETS; bucket++) { |
| fprintf(stderr, " %d:%-3d", |
| bucket ? (AGX_SPILL_UNIT_DWORDS << (bucket - 1)) : 0, |
| scratch->header->cores[core].alloc_count[bucket]); |
| } |
| fprintf(stderr, "\n"); |
| assert(!scratch->header->cores[core].alloc_cur); |
| assert(!scratch->header->cores[core].alloc_failed); |
| } |
| |
| #ifdef SCRATCH_DEBUG |
| unsigned core_index = 0; |
| for (int core = 0; core < scratch->max_core_id; core++) { |
| if (!scratch->core_present[core]) |
| continue; |
| void *p = scratch->data + scratch->core_size * core_index++; |
| fprintf(stderr, "\nCORE %d (0x%lx)\n", core, scratch->core_size); |
| u_hexdump(stderr, p, scratch->core_size, true); |
| } |
| #endif |
| } |
| |
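/*
 * Initialize the per-device scratch state. The core count is derived from
 * the per-cluster core masks, unless SCRATCH_DEBUG_CORES overrides it for
 * debugging.
 */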
| void |
| agx_scratch_init(struct agx_device *dev, struct agx_scratch *scratch) |
| { |
| memset(scratch, 0, sizeof(*scratch)); |
| |
| scratch->dev = dev; |
| #ifdef SCRATCH_DEBUG_CORES |
| scratch->num_cores = SCRATCH_DEBUG_CORES; |
| #else |
| scratch->num_cores = 0; |
| for (unsigned cl = 0; cl < dev->params.num_clusters_total; cl++) { |
| scratch->num_cores += util_bitcount(dev->params.core_masks[cl]); |
| } |
| #endif |
| } |
| |
| void |
| agx_scratch_fini(struct agx_scratch *scratch) |
| { |
| if (scratch->buf) |
| agx_bo_unreference(scratch->dev, scratch->buf); |
| scratch->buf = NULL; |
| } |