/*
 * Copyright 2022 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_tilebuffer.h"
#include <assert.h>
#include "util/bitscan.h"
#include "util/format/u_format.h"
#include "agx_usc.h"
#include "layout.h"

/* Maximum number of bytes per tile on G13G. This may change in future versions
 * of the architecture.
 */
#define MAX_BYTES_PER_TILE (32768 - 1)

/* Maximum bytes per sample in the tilebuffer. Greater allocations require
 * spilling render targets to memory.
 */
#define MAX_BYTES_PER_SAMPLE (64)

/* Minimum tile size in pixels, architectural. */
#define MIN_TILE_SIZE_PX (16 * 16)
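
/* For example, at 8 bytes per pixel a 32x32 tile needs 8192 bytes and fits
 * easily; at 40 bytes per pixel, a 32x32 tile would need 40960 bytes, so
 * 32x16 (20480 bytes) is the largest size that fits.
 */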

/* Select the largest tile size that fits */
static struct agx_tile_size
agx_select_tile_size(unsigned bytes_per_pixel)
{
   /* clang-format off */
   struct agx_tile_size sizes[] = {
      { 32, 32 },
      { 32, 16 },
      { 16, 16 }
   };
   /* clang-format on */

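   /* The sizes above run from largest to smallest, so the first size that
    * fits is the largest that fits.
    */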
   for (unsigned i = 0; i < ARRAY_SIZE(sizes); ++i) {
      struct agx_tile_size size = sizes[i];

      if ((bytes_per_pixel * size.width * size.height) <= MAX_BYTES_PER_TILE)
         return size;
   }

   unreachable("No supported tile size meets the bytes per pixel requirement");
}

static unsigned
agx_shared_layout_from_tile_size(struct agx_tile_size t)
{
   if (t.width == 32 && t.height == 32)
      return AGX_SHARED_LAYOUT_32X32;
   else if (t.width == 32 && t.height == 16)
      return AGX_SHARED_LAYOUT_32X16;
   else if (t.width == 16 && t.height == 16)
      return AGX_SHARED_LAYOUT_16X16;
   else
      unreachable("Invalid tile size");
}

struct agx_tilebuffer_layout
agx_build_tilebuffer_layout(const enum pipe_format *formats, uint8_t nr_cbufs,
                            uint8_t nr_samples, bool layered)
{
   struct agx_tilebuffer_layout tib = {
      .nr_samples = nr_samples,
      .layered = layered,
   };

   uint32_t offset_B = 0;

   for (unsigned rt = 0; rt < nr_cbufs; ++rt) {
      tib.logical_format[rt] = formats[rt];

      /* If there are gaps in the layout, don't allocate holes for them.
       * Subtle: PIPE_FORMAT_NONE has a block size of 1 rather than 0, so we
       * cannot simply fall through to the allocation below.
       */
      if (formats[rt] == PIPE_FORMAT_NONE)
         continue;

      /* Require natural alignment for tilebuffer allocations. This could be
       * optimized, but this shouldn't be a problem in practice.
       */
      enum pipe_format physical_fmt = agx_tilebuffer_physical_format(&tib, rt);
      unsigned align_B = util_format_get_blocksize(physical_fmt);
      assert(util_is_power_of_two_nonzero(align_B) &&
             util_is_power_of_two_nonzero(MAX_BYTES_PER_SAMPLE) &&
             align_B < MAX_BYTES_PER_SAMPLE &&
             "max bytes per sample divisible by alignment");

      offset_B = ALIGN_POT(offset_B, align_B);
      assert(offset_B <= MAX_BYTES_PER_SAMPLE && "loop invariant + above");

      /* Determine the size we would need to allocate this render target in
       * the tilebuffer. A single-channel physical format is replicated for
       * each component of the logical format; a packed physical format
       * covers all components in one block.
       */
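      /* For instance, if a 4-component logical format lowers to a 4-byte
       * single-channel physical format, nr = 4 and size_B = 16.
       */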
      unsigned nr = util_format_get_nr_components(physical_fmt) == 1
                       ? util_format_get_nr_components(formats[rt])
                       : 1;

      unsigned size_B = align_B * nr;
      unsigned new_offset_B = offset_B + size_B;

      /* If allocating this render target would exceed any tilebuffer limit,
       * we need to spill it to memory. We continue processing in case there
       * are smaller render targets after it that would still fit. Otherwise,
       * we allocate it to the tilebuffer.
       *
       * TODO: Suboptimal, we might be able to reorder render targets to
       * avoid fragmentation causing spilling.
       */
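      /* The first condition enforces the per-sample budget. The second checks
       * the total footprint at the minimum tile size, rounding the offset up
       * to the 8-byte sample stride granularity used below: if it fits at
       * 16x16, some supported tile size exists.
       */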
      bool fits = (new_offset_B <= MAX_BYTES_PER_SAMPLE) &&
                  (ALIGN_POT(new_offset_B, 8) * MIN_TILE_SIZE_PX *
                   nr_samples) <= MAX_BYTES_PER_TILE;

      if (fits) {
         tib._offset_B[rt] = offset_B;
         offset_B = new_offset_B;
      } else {
         tib.spilled[rt] = true;
      }
   }

   assert(offset_B <= MAX_BYTES_PER_SAMPLE && "loop invariant");

   /* Multisampling needs a nonempty allocation.
    * XXX: Check this against hw
    */
   if (nr_samples > 1)
      offset_B = MAX2(offset_B, 1);

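   /* The sample stride is aligned to 8 bytes to match the hardware encoding;
    * see sample_stride_in_8_bytes in agx_tilebuffer_pack_usc below.
    */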
   tib.sample_size_B = ALIGN_POT(offset_B, 8);

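   /* Each tile must hold every sample of every pixel, so the effective bytes
    * per pixel is the sample size times the sample count.
    */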
   tib.tile_size = agx_select_tile_size(tib.sample_size_B * nr_samples);

   agx_tilebuffer_pack_usc(&tib);
   return tib;
}

enum pipe_format
agx_tilebuffer_physical_format(struct agx_tilebuffer_layout *tib, unsigned rt)
{
   return ail_pixel_format[tib->logical_format[rt]].renderable;
}

bool
agx_tilebuffer_supports_mask(struct agx_tilebuffer_layout *tib, unsigned rt)
{
   /* We don't bother supporting masking with spilled render targets. This
    * could be optimized in the future, but spilling is rare enough in
    * practice that it isn't worth it.
    */
   if (tib->spilled[rt])
      return false;

   enum pipe_format fmt = agx_tilebuffer_physical_format(tib, rt);
   return ail_isa_format_supports_mask((enum ail_isa_format)fmt);
}

uint32_t
agx_tilebuffer_total_size(struct agx_tilebuffer_layout *tib)
{
   return tib->sample_size_B * tib->nr_samples * tib->tile_size.width *
          tib->tile_size.height;
}

void
agx_tilebuffer_pack_usc(struct agx_tilebuffer_layout *tib)
{
   agx_pack(&tib->usc, USC_SHARED, cfg) {
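      /* A nonzero sample count means this layout drives rasterization, so
       * shared memory doubles as the tilebuffer. Otherwise, the standard
       * vertex/compute shared memory layout is used.
       */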
      if (tib->nr_samples > 0) {
         cfg.uses_shared_memory = true;
         cfg.layout = agx_shared_layout_from_tile_size(tib->tile_size);
         cfg.sample_stride_in_8_bytes = tib->sample_size_B / 8;
         cfg.sample_count = tib->nr_samples;
         cfg.bytes_per_threadgroup = agx_tilebuffer_total_size(tib);
      } else {
         cfg.layout = AGX_SHARED_LAYOUT_VERTEX_COMPUTE;
         cfg.bytes_per_threadgroup = 65536;
      }
   }
}