src/intel/common/intel_tiled_render.h - third_party/mesa - Git at Google

 /*
  * Copyright © 2023 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */
 #ifndef INTEL_TILED_RENDER_H
 #define INTEL_TILED_RENDER_H

 #include "intel/common/intel_l3_config.h"
 #include "intel/dev/intel_device_info.h"
 #include "intel/isl/isl.h"

 /**
  * Return the tile cache space used as target by the tiling parameter
  * calculation algorithm below.  Cache space units are in bits.
  *
  * \sa intel_calculate_tile_dimensions()
  */
 UNUSED static unsigned
 intel_calculate_tile_cache_size(const struct intel_device_info *devinfo,
                                 const struct intel_l3_config *cfg)
 {
    const unsigned tc_l3_partition_size = 1024 * 8 *
       intel_get_l3_partition_size(devinfo, cfg, INTEL_L3P_TC);
    const unsigned all_l3_partition_size = 1024 * 8 *
       intel_get_l3_partition_size(devinfo, cfg, INTEL_L3P_ALL);
    /* Target half of the total L3 space as simple heuristic, could be
     * improved by adjusting the target dynamically.
     */
    const unsigned target_all_l3_partition_size = all_l3_partition_size / 2;
    /* If there's a tile cache partition on the L3, use its size as
     * target, otherwise (e.g. in unified L3 cache mode) use a fraction
     * of the total L3 available.
     *
     * XXX - Note that this assumes TBIMR in pixel hashing mode is in use.
     */
    const unsigned tile_cache_size = tc_l3_partition_size ? tc_l3_partition_size :
                                     target_all_l3_partition_size;
    assert(tile_cache_size > 0);
    return tile_cache_size;
 }

 /**
  * Return the amount of bits per pixel used to store an ISL surface in
  * memory.  This can be used as helper to estimate the value of the \p
  * pixel_size argument of intel_calculate_tile_dimensions() below.
  */
 UNUSED static unsigned
 intel_calculate_surface_pixel_size(const struct isl_surf *surf)
 {
    const struct isl_format_layout *layout = isl_format_get_layout(surf->format);
    const unsigned num_samples = MAX2(1, surf->samples);
    if (surf->size_B > 0)
       return DIV_ROUND_UP(layout->bpb * num_samples,
                           layout->bw * layout->bh * layout->bd);
    else
       return 0;
 }

 /**
  * Estimate tiling parameters that yield a reasonable balance between
  * tile cache utilization and avoidance of thrashing, based on the
  * device's current caching configuration, the framebuffer dimensions
  * and an estimate of the tile cache footprint per fragment in bits (\p
  * pixel_size).
  *
  * The calculated tile dimensions are guaranteed to be a multiple of
  * the block dimensions \p block_width and \p block_height, which for
  * TBIMR in pixel hashing mode must be equal to the pixel hashing
  * block size, typically 16x16 or 32x32.
  */
 UNUSED static void
 intel_calculate_tile_dimensions(const struct intel_device_info *devinfo,
                                 const struct intel_l3_config *cfg,
                                 unsigned block_width, unsigned block_height,
                                 unsigned fb_width, unsigned fb_height,
                                 unsigned pixel_size,
                                 unsigned *tile_width, unsigned *tile_height)
 {
    /* Maximum number of tiles supported by the TBIMR tile sequencing
     * hardware.
     */
    const unsigned max_horiz_tiles = 32;
    const unsigned max_vert_tiles = 32;

    /* Represent dimensions in hashing block units, which guarantees
     * that the resulting tile dimensions are a multiple of the hashing
     * block dimensions, a requirement of TBIMR in pixel hashing mode.
     */
    const unsigned fb_block_width = DIV_ROUND_UP(fb_width, block_width);
    const unsigned fb_block_height = DIV_ROUND_UP(fb_height, block_height);

    /* Amount of tile cache space for the workload to target. */
    const unsigned tile_cache_size = intel_calculate_tile_cache_size(devinfo, cfg);
    /* Cache footprint of a single hashing block worth of threads. */
    const unsigned block_size = MAX2(1, pixel_size * block_width * block_height);
    /* Calculate the desired tile surface (in block units) that fully
     * utilizes the target portion of the tile cache, which in an ideal
     * world where an oracle has given us the tile cache footprint per
     * block is just the ratio of the two.
     */
    const unsigned desired_tile_surf = MAX2(1, tile_cache_size / block_size);
    /* Clamp the desired tile surface to be between the surface of the
     * whole framebuffer and the surface of the smallest tile possible
     * at the maximum suported tile count.
     */
    const unsigned tile_surf = CLAMP(desired_tile_surf,
                                     (DIV_ROUND_UP(fb_block_width, max_horiz_tiles) *
                                      DIV_ROUND_UP(fb_block_height, max_vert_tiles)),
                                     fb_block_width * fb_block_height);
    /* XXX - If the tile_surf calculated above is smaller than the
     *       number of pixel pipes on the GPU, the pipeline is so
     *       cache-heavy that the parallelism of the GPU will have to
     *       be constrained in order to avoid thrashing the tile cache.
     *       Possibly emit a performance warning, or better, return an
     *       error indicating that the pixel pipe hashing config needs
     *       to be adjusted to use a finer hashing mode in order to
     *       spread out the workload evenly across the available slices.
     */

    /* Select the tile aspect ratio that minimizes the number of passes
     * required to render the whole framebuffer.  The search starts at
     * an approximately square tile size of the desired surface and
     * increases the ratio between its major and minor axes in a
     * sequence of finite increments.
     *
     * The algorithm is biased in favor of the squarest possible tiling
     * config since it starts with a tile shape closest to a square and
     * early-exits when a global minimum is detected.  This bias is
     * intentional since cache locality may suffer at high tile aspect
     * ratios.
     */
    const float base_major = sqrtf(tile_surf);
    /* Make sure that the minimum major axis where the search starts
     * isn't so small (due to a small framebuffer or rounding) that the
     * tile would have to be larger than the framebuffer in the
     * opposite "minor" direction.
     */
    const unsigned min_major = MAX3(1, floorf(base_major),
 				   tile_surf / MIN2(fb_block_width, fb_block_height));
    /* Stop search at a an aspect ratio of approximately 2 (A major
     * axis equal to 'base_major * M_SQRT2' would give an aspect ratio
     * of exactly 2 if it was a valid integer number).  Aspect ratios
     * higher than 2 could technically be useful, the upper bound is
     * intended as a heuristic in order to set a low limit to the
     * number of iterations the loop below may execute.
     */
    const unsigned max_major = ceilf(MAX2(base_major, min_major) * M_SQRT2);
    assert(max_major < INT_MAX);

    /* Best tile dimensions found so far. */
    unsigned best_count = UINT_MAX;
    unsigned best_block_width = 0;
    unsigned best_block_height = 0;

    for (unsigned major = min_major; major <= max_major;) {
       /* Minor axis that yields the desired tile surface for the
        * present major parameter.
        */
       const unsigned minor = MAX2(1, tile_surf / major);

       /* Calculate the total number of tiles if this aspect ratio is
        * used in the X-major orientation.
        */
       const unsigned horiz_tiles_x = DIV_ROUND_UP(fb_block_width, major);
       const unsigned vert_tiles_x = DIV_ROUND_UP(fb_block_height, minor);
       const unsigned count_x = horiz_tiles_x * vert_tiles_x;

       /* Calculate the number of blocks we need to add to the major
        * axis for the number of X-major tile columns (horiz_tiles_x)
        * to drop by one.  This avoids many useless iterations relative
        * to exhaustive search, since an increase in major can only
        * decrease the total tile count if it decreases horiz_tiles_x
        * as well, vert_tiles_x is monotonically increasing with major.
        *
        * If the number of tile columns is already 1 the X-major
        * solution cannot be improved further, use "infinity" so the
        * increment for the next iteration is only determined by the
        * Y-major search -- If the Y-major solution cannot be improved
        * either the search will be terminated.
        */
       const unsigned delta_x = horiz_tiles_x == 1 ? INT_MAX :
          DIV_ROUND_UP(fb_block_width - major * (horiz_tiles_x - 1),
                       horiz_tiles_x - 1);

       /* Update the best known solution with the present X-major one
        * if it's allowed by the hardware and requires a lower total
        * number of tiles to cover the whole framebuffer.
        */
       if (horiz_tiles_x <= max_horiz_tiles && vert_tiles_x <= max_vert_tiles &&
           count_x < best_count) {
          best_count = count_x;
          best_block_width = major;
          best_block_height = minor;

          /* The array of tiles is fully covered by the framebuffer, a
           * global minimum has been found, terminate the search.
           */
          if (count_x * tile_surf == fb_block_width * fb_block_height)
             break;
       }

       /* Calculate the total number of tiles if this aspect ratio is
        * used in the Y-major orientation.
        */
       const unsigned horiz_tiles_y = DIV_ROUND_UP(fb_block_width, minor);
       const unsigned vert_tiles_y = DIV_ROUND_UP(fb_block_height, major);
       const unsigned count_y =  horiz_tiles_y * vert_tiles_y;

       /* Calculate the number of blocks we need to add to the major
        * axis for the number of Y-major tile rows (vert_tiles_y) to
        * drop by one.  Analogous to the delta_x described above after
        * a flip of the X and Y axes.
        */
       const unsigned delta_y = vert_tiles_y == 1 ? INT_MAX :
          DIV_ROUND_UP(fb_block_height - major * (vert_tiles_y - 1),
                       vert_tiles_y - 1);

       /* Update the best known solution with the present Y-major one
        * if it's allowed by the hardware and requires a lower total
        * number of tiles to cover the whole framebuffer.
        */
       if (horiz_tiles_y <= max_horiz_tiles && vert_tiles_y <= max_vert_tiles &&
           count_y < best_count) {
          best_count = count_y;
          best_block_width = minor;
          best_block_height = major;

          /* The array of tiles is fully covered by the framebuffer, a
           * global minimum has been found, terminate the search.
           */
          if (count_y * tile_surf == fb_block_width * fb_block_height)
             break;
       }

       /* Use the smallest of the computed major increments in order to
        * visit the closest subsequent solution candidate.  If both the
        * X-major and Y-major searches have terminated major will be
        * pushed above the upper bound of the search, causing immediate
        * termination.
        */
       const unsigned delta = MIN2(delta_x, delta_y);
       assert(major + delta > major);
       major += delta;
    }

    /* Sanity-check and return the result, scaling it back to pixel
     * units.
     */
    assert(best_block_width > 0 && best_block_height > 0);
    assert(DIV_ROUND_UP(fb_block_width, best_block_width) <= max_horiz_tiles);
    assert(DIV_ROUND_UP(fb_block_height, best_block_height) <= max_vert_tiles);

    *tile_width = best_block_width * block_width;
    *tile_height = best_block_height * block_height;
 }

 #endif
	/*
	* Copyright © 2023 Intel Corporation
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	* IN THE SOFTWARE.
	*/
	#ifndef INTEL_TILED_RENDER_H
	#define INTEL_TILED_RENDER_H

	#include "intel/common/intel_l3_config.h"
	#include "intel/dev/intel_device_info.h"
	#include "intel/isl/isl.h"

	/**
	* Return the tile cache space used as target by the tiling parameter
	* calculation algorithm below. Cache space units are in bits.
	*
	* \sa intel_calculate_tile_dimensions()
	*/
	UNUSED static unsigned
	intel_calculate_tile_cache_size(const struct intel_device_info *devinfo,
	const struct intel_l3_config *cfg)
	{
	const unsigned tc_l3_partition_size = 1024 * 8 *
	intel_get_l3_partition_size(devinfo, cfg, INTEL_L3P_TC);
	const unsigned all_l3_partition_size = 1024 * 8 *
	intel_get_l3_partition_size(devinfo, cfg, INTEL_L3P_ALL);
	/* Target half of the total L3 space as simple heuristic, could be
	* improved by adjusting the target dynamically.
	*/
	const unsigned target_all_l3_partition_size = all_l3_partition_size / 2;
	/* If there's a tile cache partition on the L3, use its size as
	* target, otherwise (e.g. in unified L3 cache mode) use a fraction
	* of the total L3 available.
	*
	* XXX - Note that this assumes TBIMR in pixel hashing mode is in use.
	*/
	const unsigned tile_cache_size = tc_l3_partition_size ? tc_l3_partition_size :
	target_all_l3_partition_size;
	assert(tile_cache_size > 0);
	return tile_cache_size;
	}

	/**
	* Return the amount of bits per pixel used to store an ISL surface in
	* memory. This can be used as helper to estimate the value of the \p
	* pixel_size argument of intel_calculate_tile_dimensions() below.
	*/
	UNUSED static unsigned
	intel_calculate_surface_pixel_size(const struct isl_surf *surf)
	{
	const struct isl_format_layout *layout = isl_format_get_layout(surf->format);
	const unsigned num_samples = MAX2(1, surf->samples);
	if (surf->size_B > 0)
	return DIV_ROUND_UP(layout->bpb * num_samples,
	layout->bw * layout->bh * layout->bd);
	else
	return 0;
	}

	/**
	* Estimate tiling parameters that yield a reasonable balance between
	* tile cache utilization and avoidance of thrashing, based on the
	* device's current caching configuration, the framebuffer dimensions
	* and an estimate of the tile cache footprint per fragment in bits (\p
	* pixel_size).
	*
	* The calculated tile dimensions are guaranteed to be a multiple of
	* the block dimensions \p block_width and \p block_height, which for
	* TBIMR in pixel hashing mode must be equal to the pixel hashing
	* block size, typically 16x16 or 32x32.
	*/
	UNUSED static void
	intel_calculate_tile_dimensions(const struct intel_device_info *devinfo,
	const struct intel_l3_config *cfg,
	unsigned block_width, unsigned block_height,
	unsigned fb_width, unsigned fb_height,
	unsigned pixel_size,
	unsigned tile_width, unsigned tile_height)
	{
	/* Maximum number of tiles supported by the TBIMR tile sequencing
	* hardware.
	*/
	const unsigned max_horiz_tiles = 32;
	const unsigned max_vert_tiles = 32;

	/* Represent dimensions in hashing block units, which guarantees
	* that the resulting tile dimensions are a multiple of the hashing
	* block dimensions, a requirement of TBIMR in pixel hashing mode.
	*/
	const unsigned fb_block_width = DIV_ROUND_UP(fb_width, block_width);
	const unsigned fb_block_height = DIV_ROUND_UP(fb_height, block_height);

	/* Amount of tile cache space for the workload to target. */
	const unsigned tile_cache_size = intel_calculate_tile_cache_size(devinfo, cfg);
	/* Cache footprint of a single hashing block worth of threads. */
	const unsigned block_size = MAX2(1, pixel_size * block_width * block_height);
	/* Calculate the desired tile surface (in block units) that fully
	* utilizes the target portion of the tile cache, which in an ideal
	* world where an oracle has given us the tile cache footprint per
	* block is just the ratio of the two.
	*/
	const unsigned desired_tile_surf = MAX2(1, tile_cache_size / block_size);
	/* Clamp the desired tile surface to be between the surface of the
	* whole framebuffer and the surface of the smallest tile possible
	* at the maximum suported tile count.
	*/
	const unsigned tile_surf = CLAMP(desired_tile_surf,
	(DIV_ROUND_UP(fb_block_width, max_horiz_tiles) *
	DIV_ROUND_UP(fb_block_height, max_vert_tiles)),
	fb_block_width * fb_block_height);
	/* XXX - If the tile_surf calculated above is smaller than the
	* number of pixel pipes on the GPU, the pipeline is so
	* cache-heavy that the parallelism of the GPU will have to
	* be constrained in order to avoid thrashing the tile cache.
	* Possibly emit a performance warning, or better, return an
	* error indicating that the pixel pipe hashing config needs
	* to be adjusted to use a finer hashing mode in order to
	* spread out the workload evenly across the available slices.
	*/

	/* Select the tile aspect ratio that minimizes the number of passes
	* required to render the whole framebuffer. The search starts at
	* an approximately square tile size of the desired surface and
	* increases the ratio between its major and minor axes in a
	* sequence of finite increments.
	*
	* The algorithm is biased in favor of the squarest possible tiling
	* config since it starts with a tile shape closest to a square and
	* early-exits when a global minimum is detected. This bias is
	* intentional since cache locality may suffer at high tile aspect
	* ratios.
	*/
	const float base_major = sqrtf(tile_surf);
	/* Make sure that the minimum major axis where the search starts
	* isn't so small (due to a small framebuffer or rounding) that the
	* tile would have to be larger than the framebuffer in the
	* opposite "minor" direction.
	*/
	const unsigned min_major = MAX3(1, floorf(base_major),
	tile_surf / MIN2(fb_block_width, fb_block_height));
	/* Stop search at a an aspect ratio of approximately 2 (A major
	* axis equal to 'base_major * M_SQRT2' would give an aspect ratio
	* of exactly 2 if it was a valid integer number). Aspect ratios
	* higher than 2 could technically be useful, the upper bound is
	* intended as a heuristic in order to set a low limit to the
	* number of iterations the loop below may execute.
	*/
	const unsigned max_major = ceilf(MAX2(base_major, min_major) * M_SQRT2);
	assert(max_major < INT_MAX);

	/* Best tile dimensions found so far. */
	unsigned best_count = UINT_MAX;
	unsigned best_block_width = 0;
	unsigned best_block_height = 0;

	for (unsigned major = min_major; major <= max_major;) {
	/* Minor axis that yields the desired tile surface for the
	* present major parameter.
	*/
	const unsigned minor = MAX2(1, tile_surf / major);

	/* Calculate the total number of tiles if this aspect ratio is
	* used in the X-major orientation.
	*/
	const unsigned horiz_tiles_x = DIV_ROUND_UP(fb_block_width, major);
	const unsigned vert_tiles_x = DIV_ROUND_UP(fb_block_height, minor);
	const unsigned count_x = horiz_tiles_x * vert_tiles_x;

	/* Calculate the number of blocks we need to add to the major
	* axis for the number of X-major tile columns (horiz_tiles_x)
	* to drop by one. This avoids many useless iterations relative
	* to exhaustive search, since an increase in major can only
	* decrease the total tile count if it decreases horiz_tiles_x
	* as well, vert_tiles_x is monotonically increasing with major.
	*
	* If the number of tile columns is already 1 the X-major
	* solution cannot be improved further, use "infinity" so the
	* increment for the next iteration is only determined by the
	* Y-major search -- If the Y-major solution cannot be improved
	* either the search will be terminated.
	*/
	const unsigned delta_x = horiz_tiles_x == 1 ? INT_MAX :
	DIV_ROUND_UP(fb_block_width - major * (horiz_tiles_x - 1),
	horiz_tiles_x - 1);

	/* Update the best known solution with the present X-major one
	* if it's allowed by the hardware and requires a lower total
	* number of tiles to cover the whole framebuffer.
	*/
	if (horiz_tiles_x <= max_horiz_tiles && vert_tiles_x <= max_vert_tiles &&
	count_x < best_count) {
	best_count = count_x;
	best_block_width = major;
	best_block_height = minor;

	/* The array of tiles is fully covered by the framebuffer, a
	* global minimum has been found, terminate the search.
	*/
	if (count_x * tile_surf == fb_block_width * fb_block_height)
	break;
	}

	/* Calculate the total number of tiles if this aspect ratio is
	* used in the Y-major orientation.
	*/
	const unsigned horiz_tiles_y = DIV_ROUND_UP(fb_block_width, minor);
	const unsigned vert_tiles_y = DIV_ROUND_UP(fb_block_height, major);
	const unsigned count_y = horiz_tiles_y * vert_tiles_y;

	/* Calculate the number of blocks we need to add to the major
	* axis for the number of Y-major tile rows (vert_tiles_y) to
	* drop by one. Analogous to the delta_x described above after
	* a flip of the X and Y axes.
	*/
	const unsigned delta_y = vert_tiles_y == 1 ? INT_MAX :
	DIV_ROUND_UP(fb_block_height - major * (vert_tiles_y - 1),
	vert_tiles_y - 1);

	/* Update the best known solution with the present Y-major one
	* if it's allowed by the hardware and requires a lower total
	* number of tiles to cover the whole framebuffer.
	*/
	if (horiz_tiles_y <= max_horiz_tiles && vert_tiles_y <= max_vert_tiles &&
	count_y < best_count) {
	best_count = count_y;
	best_block_width = minor;
	best_block_height = major;

	/* The array of tiles is fully covered by the framebuffer, a
	* global minimum has been found, terminate the search.
	*/
	if (count_y * tile_surf == fb_block_width * fb_block_height)
	break;
	}

	/* Use the smallest of the computed major increments in order to
	* visit the closest subsequent solution candidate. If both the
	* X-major and Y-major searches have terminated major will be
	* pushed above the upper bound of the search, causing immediate
	* termination.
	*/
	const unsigned delta = MIN2(delta_x, delta_y);
	assert(major + delta > major);
	major += delta;
	}

	/* Sanity-check and return the result, scaling it back to pixel
	* units.
	*/
	assert(best_block_width > 0 && best_block_height > 0);
	assert(DIV_ROUND_UP(fb_block_width, best_block_width) <= max_horiz_tiles);
	assert(DIV_ROUND_UP(fb_block_height, best_block_height) <= max_vert_tiles);

	tile_width = best_block_width block_width;
	tile_height = best_block_height block_height;
	}

	#endif