blob: f666fd0c39965586eafe7352aafcbdf847b5fbc7 [file] [log] [blame]
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_eu.h"
#include "brw_nir.h"
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"
/**
* \file brw_nir_analyze_ubo_ranges.c
*
* This pass decides which portions of UBOs to upload as push constants,
* so shaders can access them as part of the thread payload, rather than
* having to issue expensive memory reads to pull the data.
*
* The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
* buffers, in GRF sized units. This was always 256 bits (32 bytes).
* Starting with Xe2, it is 512 bits (64 bytes).
*
* To do this, we examine NIR load_ubo intrinsics, recording the number of
* loads at each offset. We track offsets at a sizeof(GRF) granularity, so even
* fields with a bit of padding between them tend to fall into contiguous
* ranges. We build a list of these ranges, tracking their "cost" (number
* of registers required) and "benefit" (number of pull loads eliminated
* by pushing the range). We then sort the list to obtain the four best
* ranges (most benefit for the least cost).
*/
struct ubo_range_entry
{
struct brw_ubo_range range;
int benefit;
};
static int
score(const struct ubo_range_entry *entry)
{
return 2 * entry->benefit - entry->range.length;
}
/**
* Compares score for two UBO range entries.
*
* For a descending qsort().
*/
static int
cmp_ubo_range_entry(const void *va, const void *vb)
{
const struct ubo_range_entry *a = va;
const struct ubo_range_entry *b = vb;
/* Rank based on scores, descending order */
int delta = score(b) - score(a);
/* Then use the UBO block index as a tie-breaker, descending order */
if (delta == 0)
delta = b->range.block - a->range.block;
/* Finally use the start offset as a second tie-breaker, ascending order */
if (delta == 0)
delta = a->range.start - b->range.start;
return delta;
}
struct ubo_block_info
{
/* Each bit in the offsets bitfield represents a 32-byte section of data.
* If it's set to one, there is interesting UBO data at that offset. If
* not, there's a "hole" - padding between data - or just nothing at all.
*/
uint64_t offsets;
uint8_t uses[64];
};
struct ubo_analysis_state
{
struct hash_table *blocks;
const struct intel_device_info *devinfo;
};
static struct ubo_block_info *
get_block_info(struct ubo_analysis_state *state, int block)
{
uint32_t hash = block + 1;
void *key = (void *) (uintptr_t) hash;
struct hash_entry *entry =
_mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
if (entry)
return (struct ubo_block_info *) entry->data;
struct ubo_block_info *info =
rzalloc(state->blocks, struct ubo_block_info);
_mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
return info;
}
static void
analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
{
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_ubo)
continue;
if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
nir_src_is_const(intrin->src[1])) {
const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]);
const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
const unsigned sizeof_GRF = REG_SIZE * reg_unit(state->devinfo);
const int offset = byte_offset / sizeof_GRF;
/* Avoid shifting by larger than the width of our bitfield, as this
* is undefined in C. Even if we require multiple bits to represent
* the entire value, it's OK to record a partial value - the backend
* is capable of falling back to pull loads for later components of
* vectors, as it has to shrink ranges for other reasons anyway.
*/
if (offset >= 64)
continue;
/* The value might span multiple sizeof(GRF) chunks. */
const unsigned num_components =
nir_def_last_component_read(&intrin->def) + 1;
const int bytes = num_components * (intrin->def.bit_size / 8);
const int start = ROUND_DOWN_TO(byte_offset, sizeof_GRF);
const int end = ALIGN(byte_offset + bytes, sizeof_GRF);
const int chunks = (end - start) / sizeof_GRF;
/* TODO: should we count uses in loops as higher benefit? */
struct ubo_block_info *info = get_block_info(state, block);
info->offsets |= ((1ull << chunks) - 1) << offset;
info->uses[offset]++;
}
}
}
static void
print_ubo_entry(FILE *file,
const struct ubo_range_entry *entry,
struct ubo_analysis_state *state)
{
struct ubo_block_info *info = get_block_info(state, entry->range.block);
fprintf(file,
"block %2d, start %2d, length %2d, bits = %"PRIx64", "
"benefit %2d, cost %2d, score = %2d\n",
entry->range.block, entry->range.start, entry->range.length,
info->offsets, entry->benefit, entry->range.length, score(entry));
}
void
brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
nir_shader *nir,
struct brw_ubo_range out_ranges[4])
{
void *mem_ctx = ralloc_context(NULL);
struct ubo_analysis_state state = {
.blocks =
_mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
.devinfo = compiler->devinfo,
};
/* Walk the IR, recording how many times each UBO block/offset is used. */
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
analyze_ubos_block(&state, block);
}
}
/* Find ranges: a block, starting register-size aligned byte offset, and
* length.
*/
struct util_dynarray ranges;
util_dynarray_init(&ranges, mem_ctx);
hash_table_foreach(state.blocks, entry) {
const int b = entry->hash - 1;
const struct ubo_block_info *info = entry->data;
uint64_t offsets = info->offsets;
/* Walk through the offsets bitfield, finding contiguous regions of
* set bits:
*
* 0000000001111111111111000000000000111111111111110000000011111100
* ^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^
*
* Each of these will become a UBO range.
*/
while (offsets != 0) {
/* Find the first 1 in the offsets bitfield. This represents the
* start of a range of interesting UBO data. Make it zero-indexed.
*/
int first_bit = ffsll(offsets) - 1;
/* Find the first 0 bit in offsets beyond first_bit. To find the
* first zero bit, we find the first 1 bit in the complement. In
* order to ignore bits before first_bit, we mask off those bits.
*/
int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
if (first_hole == -1) {
/* If we didn't find a hole, then set it to the end of the
* bitfield. There are no more ranges to process.
*/
first_hole = 64;
offsets = 0;
} else {
/* We've processed all bits before first_hole. Mask them off. */
offsets &= ~((1ull << first_hole) - 1);
}
struct ubo_range_entry *entry =
util_dynarray_grow(&ranges, struct ubo_range_entry, 1);
entry->range.block = b;
entry->range.start = first_bit;
/* first_hole is one beyond the end, so we don't need to add 1 */
entry->range.length = first_hole - first_bit;
entry->benefit = 0;
for (int i = 0; i < entry->range.length; i++)
entry->benefit += info->uses[first_bit + i];
}
}
int nr_entries = ranges.size / sizeof(struct ubo_range_entry);
if (0) {
util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
print_ubo_entry(stderr, entry, &state);
}
}
/* TODO: Consider combining ranges.
*
* We can only push 4 ranges via 3DSTATE_CONSTANT_XS. If there are
* more ranges, and two are close by with only a small hole, it may be
* worth combining them. The holes will waste register space, but the
* benefit of removing pulls may outweigh that cost.
*/
/* Sort the list so the most beneficial ranges are at the front. */
if (nr_entries > 0) {
qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
cmp_ubo_range_entry);
}
struct ubo_range_entry *entries = ranges.data;
/* Return the top 4, limited to the maximum number of push registers.
*
* The Vulkan driver sets up additional non-UBO push constants, so it may
* need to shrink these ranges further (see anv_nir_compute_push_layout.c).
* The OpenGL driver treats legacy uniforms as a UBO, so this is enough.
*
* To limit further, simply drop the tail of the list, as that's the least
* valuable portion.
*/
const int max_ubos = 4;
nr_entries = MIN2(nr_entries, max_ubos);
const unsigned max_push_regs = 64 / reg_unit(compiler->devinfo);
unsigned total_push_regs = 0;
for (unsigned i = 0; i < nr_entries; i++) {
if (total_push_regs + entries[i].range.length > max_push_regs)
entries[i].range.length = max_push_regs - total_push_regs;
total_push_regs += entries[i].range.length;
}
for (int i = 0; i < nr_entries; i++) {
out_ranges[i] = entries[i].range;
/* To this point, various values have been tracked in terms of the real
* hardware register sizes. However, the rest of the compiler expects
* values in terms of pre-Xe2 256-bit registers. Scale start and length
* to account for this.
*/
out_ranges[i].start *= reg_unit(compiler->devinfo);
out_ranges[i].length *= reg_unit(compiler->devinfo);
}
for (int i = nr_entries; i < 4; i++) {
out_ranges[i].block = 0;
out_ranges[i].start = 0;
out_ranges[i].length = 0;
}
ralloc_free(ranges.mem_ctx);
}