/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <assert.h>
#include <stdbool.h>
#include "anv_private.h"
#include "anv_measure.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"
#include "ds/intel_tracepoints.h"
#include "genX_mi_builder.h"
static void
cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
VkShaderStageFlags stages = pipeline->base.base.active_stages;
/* In order to avoid thrash, we assume that vertex and fragment stages
* always exist. In the rare case where one is missing *and* the other
* uses push constants, this may be suboptimal. However, avoiding stalls
* seems more important.
*/
stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
if (anv_pipeline_is_primitive(pipeline))
stages |= VK_SHADER_STAGE_VERTEX_BIT;
if (stages == cmd_buffer->state.gfx.push_constant_stages)
return;
unsigned push_constant_kb;
const struct intel_device_info *devinfo = cmd_buffer->device->info;
if (anv_pipeline_is_mesh(pipeline))
push_constant_kb = devinfo->mesh_max_constant_urb_size_kb;
else
push_constant_kb = devinfo->max_constant_urb_size_kb;
const unsigned num_stages =
util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
unsigned size_per_stage = push_constant_kb / num_stages;
/* Broadwell+ and Haswell gt3 require that the push constant sizes be in
* units of 2KB. Incidentally, these are the same platforms that have
* 32KB worth of push constant space.
*/
if (push_constant_kb == 32)
size_per_stage &= ~1u;
uint32_t kb_used = 0;
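/* Program 3DSTATE_PUSH_CONSTANT_ALLOC_{VS,HS,DS,GS} by overriding the
 * sub-opcode of the VS packet (18 + stage). Stages that aren't active get a
 * zero-sized allocation; the PS allocation below receives whatever space
 * remains.
 */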
for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
const unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
anv_batch_emit(&cmd_buffer->batch,
GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
alloc._3DCommandSubOpcode = 18 + i;
alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
alloc.ConstantBufferSize = push_size;
}
kb_used += push_size;
}
anv_batch_emit(&cmd_buffer->batch,
GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
alloc.ConstantBufferOffset = kb_used;
alloc.ConstantBufferSize = push_constant_kb - kb_used;
}
#if GFX_VERx10 == 125
/* DG2: Wa_22011440098
* MTL: Wa_18022330953
*
* In 3D mode, after programming the push constant alloc command,
* immediately program a push constant command (zero length) without any
* commit between them.
*/
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
/* Update empty push constants for all stages (bitmask = 11111b) */
c.ShaderUpdateEnable = 0x1f;
c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
}
#endif
cmd_buffer->state.gfx.push_constant_stages = stages;
/* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
*
* "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
* the next 3DPRIMITIVE command after programming the
* 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
*
* Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
* pipeline setup, we need to dirty push constants.
*/
cmd_buffer->state.push_constants_dirty |= stages;
}
static void
cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
uint32_t stages)
{
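/* _3DCommandSubOpcode values for the per-stage variants of
 * 3DSTATE_SAMPLER_STATE_POINTERS_XS and 3DSTATE_BINDING_TABLE_POINTERS_XS.
 * The VS packet templates below are retargeted by overriding the
 * sub-opcode.
 */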
static const uint32_t sampler_state_opcodes[] = {
[MESA_SHADER_VERTEX] = 43,
[MESA_SHADER_TESS_CTRL] = 44, /* HS */
[MESA_SHADER_TESS_EVAL] = 45, /* DS */
[MESA_SHADER_GEOMETRY] = 46,
[MESA_SHADER_FRAGMENT] = 47,
};
static const uint32_t binding_table_opcodes[] = {
[MESA_SHADER_VERTEX] = 38,
[MESA_SHADER_TESS_CTRL] = 39,
[MESA_SHADER_TESS_EVAL] = 40,
[MESA_SHADER_GEOMETRY] = 41,
[MESA_SHADER_FRAGMENT] = 42,
};
anv_foreach_stage(s, stages) {
assert(s < ARRAY_SIZE(binding_table_opcodes));
if (cmd_buffer->state.samplers[s].alloc_size > 0) {
anv_batch_emit(&cmd_buffer->batch,
GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
}
}
/* Always emit binding table pointers if we're asked to, since on SKL
* this is what flushes push constants. */
anv_batch_emit(&cmd_buffer->batch,
GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
btp._3DCommandSubOpcode = binding_table_opcodes[s];
btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
}
}
}
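/* Returns the address of the memory backing the given push range: a
 * descriptor set buffer, a descriptor buffer, the command buffer's push
 * constant allocation, a (possibly dynamic) UBO, or the workaround BO for
 * NULL ranges.
 */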
static struct anv_address
get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
const struct anv_shader_bin *shader,
const struct anv_push_range *range)
{
struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
switch (range->set) {
case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
/* This is a descriptor set buffer so the set index is
* actually given by binding->binding. (Yes, that's
* confusing.)
*/
struct anv_descriptor_set *set =
gfx_state->base.descriptors[range->index];
return anv_descriptor_set_address(set);
}
case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
return anv_address_from_u64(
anv_cmd_buffer_descriptor_buffer_address(
cmd_buffer,
gfx_state->base.descriptor_buffers[range->index].buffer_index) +
gfx_state->base.descriptor_buffers[range->index].buffer_offset);
}
case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
if (gfx_state->base.push_constants_state.alloc_size == 0) {
gfx_state->base.push_constants_state =
anv_cmd_buffer_gfx_push_constants(cmd_buffer);
}
return anv_cmd_buffer_gfx_push_constants_state_address(
cmd_buffer, gfx_state->base.push_constants_state);
}
case ANV_DESCRIPTOR_SET_NULL:
case ANV_DESCRIPTOR_SET_PER_PRIM_PADDING:
return cmd_buffer->device->workaround_address;
default: {
assert(range->set < MAX_SETS);
struct anv_descriptor_set *set =
gfx_state->base.descriptors[range->set];
const struct anv_descriptor *desc =
&set->descriptors[range->index];
if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
if (desc->buffer) {
return anv_address_add(desc->buffer->address,
desc->offset);
}
} else {
assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
if (desc->buffer) {
const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
uint32_t dynamic_offset =
pipe_state->dynamic_offsets[
range->set].offsets[range->dynamic_offset_index];
return anv_address_add(desc->buffer->address,
desc->offset + dynamic_offset);
}
}
/* For NULL UBOs, we just return an address in the workaround BO. We do
* writes to it for workarounds but always at the bottom. The higher
* bytes should be all zeros.
*/
assert(range->length * 32 <= 2048);
return cmd_buffer->device->workaround_address;
}
}
}
/** Returns the size in bytes of the bound buffer
*
* The range is relative to the start of the buffer, not the start of the
* range. The returned range may be smaller than
*
* (range->start + range->length) * 32;
*/
static uint32_t
get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
const struct anv_shader_bin *shader,
const struct anv_push_range *range)
{
assert(shader->stage != MESA_SHADER_COMPUTE);
const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
switch (range->set) {
case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
struct anv_descriptor_set *set =
gfx_state->base.descriptors[range->index];
struct anv_state state = set->desc_surface_mem;
assert(range->start * 32 < state.alloc_size);
assert((range->start + range->length) * 32 <= state.alloc_size);
return state.alloc_size;
}
case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER:
return gfx_state->base.pipeline->layout.set[
range->index].layout->descriptor_buffer_surface_size;
case ANV_DESCRIPTOR_SET_NULL:
case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
case ANV_DESCRIPTOR_SET_PER_PRIM_PADDING:
return (range->start + range->length) * 32;
default: {
assert(range->set < MAX_SETS);
struct anv_descriptor_set *set =
gfx_state->base.descriptors[range->set];
const struct anv_descriptor *desc =
&set->descriptors[range->index];
if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
/* Here we promote a UBO to a binding table entry so that we can avoid a
 * layer of indirection. We use the descriptor set's internally allocated
 * surface state to fill the binding table entry.
 */
if (!desc->buffer)
return 0;
if (range->start * 32 > desc->bind_range)
return 0;
return desc->bind_range;
} else {
if (!desc->buffer)
return 0;
assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
/* Compute the offset within the buffer */
const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
uint32_t dynamic_offset =
pipe_state->dynamic_offsets[
range->set].offsets[range->dynamic_offset_index];
uint64_t offset = desc->offset + dynamic_offset;
/* Clamp to the buffer size */
offset = MIN2(offset, desc->buffer->vk.size);
/* Clamp the range to the buffer size */
uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
/* Align the range for consistency */
bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
return bound_range;
}
}
}
}
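/* Emit 3DSTATE_CONSTANT_XS for a single stage, binding up to 4 push ranges.
 * The ranges are packed into the highest constant buffer slots to satisfy
 * the SKL PRM restriction quoted below.
 */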
static void
cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
gl_shader_stage stage,
struct anv_address *buffers,
unsigned buffer_count)
{
const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
const struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(gfx_state->base.pipeline);
static const uint32_t push_constant_opcodes[] = {
[MESA_SHADER_VERTEX] = 21,
[MESA_SHADER_TESS_CTRL] = 25, /* HS */
[MESA_SHADER_TESS_EVAL] = 26, /* DS */
[MESA_SHADER_GEOMETRY] = 22,
[MESA_SHADER_FRAGMENT] = 23,
};
assert(stage < ARRAY_SIZE(push_constant_opcodes));
UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
c._3DCommandSubOpcode = push_constant_opcodes[stage];
/* Set MOCS.
*
* We only have one MOCS field for the whole packet, not one per
* buffer. We could go out of our way here to walk over all of
* the buffers and see if any of them are used externally and use
* the external MOCS. However, the notion that someone would use
* the same bit of memory for both scanout and a UBO is nuts.
*
* Let's not bother and assume it's all internal.
*/
c.MOCS = mocs;
if (anv_pipeline_has_stage(pipeline, stage)) {
const struct anv_pipeline_bind_map *bind_map =
&pipeline->base.shaders[stage]->bind_map;
/* The Skylake PRM contains the following restriction:
*
* "The driver must ensure The following case does not occur
* without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
* buffer 3 read length equal to zero committed followed by a
* 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
* zero committed."
*
* To avoid this, we program the buffers in the highest slots.
* This way, slot 0 is only used if slot 3 is also used.
*/
assert(buffer_count <= 4);
const unsigned shift = 4 - buffer_count;
for (unsigned i = 0; i < buffer_count; i++) {
const struct anv_push_range *range = &bind_map->push_ranges[i];
/* At this point we only have non-empty ranges */
assert(range->length > 0);
c.ConstantBody.ReadLength[i + shift] = range->length;
c.ConstantBody.Buffer[i + shift] =
anv_address_add(buffers[i], range->start * 32);
}
}
}
}
#if GFX_VER >= 12
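/* Emit a single 3DSTATE_CONSTANT_ALL packet. With buffer_count == 0 this
 * only disables push constants for the stages in shader_mask; otherwise
 * shader_mask is expected to contain exactly one stage, whose bind map
 * provides the push ranges.
 */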
static void
cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
uint32_t shader_mask,
struct anv_address *buffers,
uint32_t buffer_count)
{
if (buffer_count == 0) {
if (shader_mask) {
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
c.ShaderUpdateEnable = shader_mask;
c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
}
}
return;
}
const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
const struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(gfx_state->base.pipeline);
gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
const struct anv_pipeline_bind_map *bind_map =
&pipeline->base.shaders[stage]->bind_map;
uint32_t *dw;
const uint32_t buffer_mask = (1 << buffer_count) - 1;
const uint32_t num_dwords = 2 + 2 * buffer_count;
dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
GENX(3DSTATE_CONSTANT_ALL),
.ShaderUpdateEnable = shader_mask,
.PointerBufferMask = buffer_mask,
.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
for (int i = 0; i < buffer_count; i++) {
const struct anv_push_range *range = &bind_map->push_ranges[i];
GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
&cmd_buffer->batch, dw + 2 + i * 2,
&(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
.PointerToConstantBuffer =
anv_address_add(buffers[i], range->start * 32),
.ConstantBufferReadLength = range->length,
});
}
}
#endif
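/* Re-emit push constant packets for the dirty graphics stages: recompute
 * the robustness register masks, gather the buffer addresses for each
 * stage's push ranges and emit 3DSTATE_CONSTANT_XS (or, where possible on
 * Gfx12+, a combined 3DSTATE_CONSTANT_ALL).
 */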
static void
cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
VkShaderStageFlags dirty_stages)
{
VkShaderStageFlags flushed = 0;
struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
const struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(gfx_state->base.pipeline);
#if GFX_VER >= 12
uint32_t nobuffer_stages = 0;
#endif
/* Compute robust pushed register access mask for each stage. */
anv_foreach_stage(stage, dirty_stages) {
if (!anv_pipeline_has_stage(pipeline, stage))
continue;
const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
if (shader->prog_data->zero_push_reg) {
const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
struct anv_push_constants *push = &gfx_state->base.push_constants;
push->gfx.push_reg_mask[stage] = 0;
/* Start of the current range in the shader, relative to the start of
* push constants in the shader.
*/
unsigned range_start_reg = 0;
for (unsigned i = 0; i < 4; i++) {
const struct anv_push_range *range = &bind_map->push_ranges[i];
if (range->length == 0)
continue;
/* Never clear this padding register as it might contain payload
* data.
*/
if (range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING)
continue;
unsigned bound_size =
get_push_range_bound_size(cmd_buffer, shader, range);
if (bound_size >= range->start * 32) {
unsigned bound_regs =
MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
range->length);
assert(range_start_reg + bound_regs <= 64);
push->gfx.push_reg_mask[stage] |=
BITFIELD64_RANGE(range_start_reg, bound_regs);
}
cmd_buffer->state.push_constants_dirty |=
mesa_to_vk_shader_stage(stage);
gfx_state->base.push_constants_data_dirty = true;
range_start_reg += range->length;
}
}
}
/* Setting NULL resets the push constant state so that we allocate a new one
* if needed. If the push constant data is not dirty, get_push_range_address
* can re-use the existing allocation.
* re-use existing allocation.
*
* Always reallocate on gfx9 and gfx11 to fix push-constant-related flaky
* tests.
* See https://gitlab.freedesktop.org/mesa/mesa/-/issues/11064
*/
if (gfx_state->base.push_constants_data_dirty || GFX_VER < 12)
gfx_state->base.push_constants_state = ANV_STATE_NULL;
#if GFX_VERx10 >= 125
const struct brw_mesh_prog_data *mesh_prog_data =
get_mesh_prog_data(pipeline);
#endif
anv_foreach_stage(stage, dirty_stages) {
unsigned buffer_count = 0;
flushed |= mesa_to_vk_shader_stage(stage);
UNUSED uint32_t max_push_range = 0;
struct anv_address buffers[4] = {};
if (anv_pipeline_has_stage(pipeline, stage)) {
const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
/* We have to gather buffer addresses as a second step because the
* loop above puts data into the push constant area and the call to
* get_push_range_address is what locks our push constants and copies
* them into the actual GPU buffer. If we did the two loops at the
* same time, we'd risk only having some of the sizes in the push
* constant buffer when we did the copy.
*/
for (unsigned i = 0; i < 4; i++) {
const struct anv_push_range *range = &bind_map->push_ranges[i];
if (range->length == 0)
break;
#if GFX_VERx10 >= 125
/* Padding for Mesh only matters where the platform supports Mesh
* shaders.
*/
if (range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING &&
mesh_prog_data && !mesh_prog_data->map.wa_18019110168_active) {
break;
}
#endif
buffers[i] = get_push_range_address(cmd_buffer, shader, range);
max_push_range = MAX2(max_push_range, range->length);
buffer_count++;
}
/* We have at most 4 buffers but they should be tightly packed */
for (unsigned i = buffer_count; i < 4; i++) {
assert(bind_map->push_ranges[i].length == 0 ||
bind_map->push_ranges[i].set ==
ANV_DESCRIPTOR_SET_PER_PRIM_PADDING);
}
}
#if GFX_VER >= 12
/* If this stage doesn't have any push constants, emit it later in a
* single CONSTANT_ALL packet.
*/
if (buffer_count == 0) {
nobuffer_stages |= 1 << stage;
continue;
}
/* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
* contains only 5 bits, so we can only use it for buffers smaller than
* 32.
*
* According to Wa_16011448509, Gfx12.0 misinterprets some address bits
* in 3DSTATE_CONSTANT_ALL. It should still be safe to use the command
* for disabling stages, where all address bits are zero. However, we
* can't safely use it for general buffers with arbitrary addresses.
* Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
* case.
*/
if (max_push_range < 32 && GFX_VERx10 > 120) {
cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
buffers, buffer_count);
continue;
}
#endif
cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
}
#if GFX_VER >= 12
if (nobuffer_stages)
/* Wa_16011448509: all address bits are zero */
cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
#endif
cmd_buffer->state.push_constants_dirty &= ~flushed;
gfx_state->base.push_constants_data_dirty = false;
}
#if GFX_VERx10 >= 125
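/* Returns the 64-bit GPU address of the push constant data for a task or
 * mesh stage, offset to the stage's first push range, or 0 if the stage has
 * no push constants.
 */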
static inline uint64_t
get_mesh_task_push_addr64(struct anv_cmd_buffer *cmd_buffer,
const struct anv_graphics_pipeline *pipeline,
gl_shader_stage stage)
{
struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
if (bind_map->push_ranges[0].length == 0)
return 0;
if (gfx_state->base.push_constants_state.alloc_size == 0) {
gfx_state->base.push_constants_state =
anv_cmd_buffer_gfx_push_constants(cmd_buffer);
}
return anv_address_physical(
anv_address_add(
anv_cmd_buffer_gfx_push_constants_state_address(cmd_buffer,
gfx_state->base.push_constants_state),
bind_map->push_ranges[0].start * 32));
}
static void
cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
VkShaderStageFlags dirty_stages)
{
struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
const struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(gfx_state->base.pipeline);
if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT &&
anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
uint64_t push_addr64 =
get_mesh_task_push_addr64(cmd_buffer, pipeline, MESA_SHADER_TASK);
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
data.InlineData[ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 0] = push_addr64 & 0xffffffff;
data.InlineData[ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 1] = push_addr64 >> 32;
}
}
if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT &&
anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
uint64_t push_addr64 =
get_mesh_task_push_addr64(cmd_buffer, pipeline, MESA_SHADER_MESH);
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
data.InlineData[ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 0] = push_addr64 & 0xffffffff;
data.InlineData[ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 1] = push_addr64 >> 32;
data.InlineData[ANV_INLINE_PARAM_MESH_PROVOKING_VERTEX / 4] = gfx_state->dyn_state.mesh_provoking_vertex;
}
}
cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
}
#endif
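/* Track changes to the pipeline's color output mapping. When a render
 * target's binding table entry effectively changes, re-emit the fragment
 * descriptors and, on Gfx11+, flush the render target cache with a PS
 * scoreboard stall.
 */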
ALWAYS_INLINE static void
cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
const struct anv_graphics_pipeline *pipeline)
{
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
return;
UNUSED bool need_rt_flush = false;
for (uint32_t rt = 0; rt < pipeline->num_color_outputs; rt++) {
/* No writes going to this render target so it won't affect the RT cache
*/
if (pipeline->color_output_mapping[rt] == ANV_COLOR_OUTPUT_UNUSED)
continue;
/* No change */
if (cmd_buffer->state.gfx.color_output_mapping[rt] ==
pipeline->color_output_mapping[rt])
continue;
cmd_buffer->state.gfx.color_output_mapping[rt] =
pipeline->color_output_mapping[rt];
need_rt_flush = true;
cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
}
#if GFX_VER >= 11
if (need_rt_flush) {
/* The PIPE_CONTROL command description says:
*
* "Whenever a Binding Table Index (BTI) used by a Render Target Message
* points to a different RENDER_SURFACE_STATE, SW must issue a Render
* Target Cache Flush by enabling this bit. When render target flush
* is set due to new association of BTI, PS Scoreboard Stall bit must
* be set in this packet."
*
* Within a renderpass, the render target entries in the binding tables
* remain the same as what was set up at CmdBeginRendering(), with one
* exception where we have to set up a null render target because a
* fragment shader writes only depth/stencil yet the renderpass has been
* set up with at least one color attachment. This is because our render
* target messages in the shader always send the color.
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
"change RT due to shader outputs");
}
#endif
}
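/* Emit a single 3DSTATE_VERTEX_BUFFERS packet covering all the bindings in
 * vb_emit; bindings with a zero size are programmed as null vertex buffers.
 */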
ALWAYS_INLINE static void
cmd_buffer_flush_vertex_buffers(struct anv_cmd_buffer *cmd_buffer,
uint32_t vb_emit)
{
const struct vk_dynamic_graphics_state *dyn =
&cmd_buffer->vk.dynamic_graphics_state;
const uint32_t num_buffers = __builtin_popcount(vb_emit);
const uint32_t num_dwords = 1 + num_buffers * 4;
uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
GENX(3DSTATE_VERTEX_BUFFERS));
uint32_t i = 0;
u_foreach_bit(vb, vb_emit) {
const struct anv_vertex_binding *binding =
&cmd_buffer->state.vertex_bindings[vb];
struct GENX(VERTEX_BUFFER_STATE) state;
if (binding->size > 0) {
uint32_t stride = dyn->vi_binding_strides[vb];
state = (struct GENX(VERTEX_BUFFER_STATE)) {
.VertexBufferIndex = vb,
.MOCS = binding->mocs,
.AddressModifyEnable = true,
.BufferPitch = stride,
.BufferStartingAddress = anv_address_from_u64(binding->addr),
#if GFX_VER >= 12
.L3BypassDisable = true,
#endif
.BufferSize = binding->size,
};
} else {
state = (struct GENX(VERTEX_BUFFER_STATE)) {
.VertexBufferIndex = vb,
.NullVertexBuffer = true,
.MOCS = anv_mocs(cmd_buffer->device, NULL,
ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
};
}
#if GFX_VER == 9
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
state.BufferStartingAddress,
state.BufferSize);
#endif
GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
i++;
}
}
ALWAYS_INLINE static void
cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
const struct vk_dynamic_graphics_state *dyn =
&cmd_buffer->vk.dynamic_graphics_state;
assert((pipeline->base.base.active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.base.l3_config);
genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
genX(flush_descriptor_buffers)(cmd_buffer, &cmd_buffer->state.gfx.base);
genX(flush_pipeline_select_3d)(cmd_buffer);
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
/* Wa_14015814527
*
* Apply task URB workaround when switching from task to primitive.
*/
if (anv_pipeline_is_primitive(pipeline)) {
genX(apply_task_urb_workaround)(cmd_buffer);
} else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
cmd_buffer->state.gfx.used_task_shader = true;
}
cmd_buffer_maybe_flush_rt_writes(cmd_buffer, pipeline);
}
/* Apply any pending pipeline flushes we may have. We want to apply them
* now because, if any of those flushes are for things like push constants,
* the GPU will read the state at weird times.
*/
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
/* Check what vertex buffers have been rebound against the set of bindings
* being used by the current set of vertex attributes.
*/
uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & dyn->vi->bindings_valid;
/* If the pipeline changed, we have to consider all the valid bindings. */
if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
vb_emit |= dyn->vi->bindings_valid;
if (vb_emit) {
cmd_buffer_flush_vertex_buffers(cmd_buffer, vb_emit);
cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
}
const bool any_dynamic_state_dirty =
vk_dynamic_graphics_state_any_dirty(dyn);
uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
pipeline->base.base.active_stages;
descriptors_dirty |=
genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
&cmd_buffer->state.gfx.base,
&pipeline->base.base);
if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
!any_dynamic_state_dirty &&
((cmd_buffer->state.push_constants_dirty &
(VK_SHADER_STAGE_ALL_GRAPHICS |
VK_SHADER_STAGE_TASK_BIT_EXT |
VK_SHADER_STAGE_MESH_BIT_EXT)) == 0))
return;
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
/* Wa_16011411144:
*
* SW must insert a PIPE_CONTROL cmd before and after the
* 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
* state is not combined with other state changes.
*/
if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_CS_STALL_BIT,
"before SO_BUFFER change WA");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
/* We don't need any per-buffer dirty tracking because you're not
* allowed to bind different XFB buffers while XFB is enabled.
*/
for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
#if GFX_VER < 12
sob.SOBufferIndex = idx;
#else
sob._3DCommandOpcode = 0;
sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
#endif
if (cmd_buffer->state.xfb_enabled &&
xfb->addr != 0 && xfb->size != 0) {
sob.MOCS = xfb->mocs;
sob.SurfaceBaseAddress = anv_address_from_u64(xfb->addr);
sob.SOBufferEnable = true;
sob.StreamOffsetWriteEnable = false;
/* Size is in DWords - 1 */
sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
} else {
sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
}
}
}
if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
/* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_CS_STALL_BIT,
"after SO_BUFFER change WA");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
} else if (GFX_VER >= 10) {
/* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_CS_STALL_BIT,
"after 3DSTATE_SO_BUFFER call");
}
}
/* Flush the runtime state into the HW state tracking */
if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty)
genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer);
/* Flush the HW state into the command buffer */
if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty))
genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
/* If the pipeline changed, we may need to re-allocate push constant space
* in the URB.
*/
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
/* Also add the relocations (scratch buffers) */
VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
pipeline->base.base.batch.relocs);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
}
/* Render targets live in the same binding table as fragment descriptors */
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
/* We emit the binding tables and sampler tables first, then emit push
* constants and then finally emit binding table and sampler table
* pointers. It has to happen in this order, since emitting the binding
* tables may change the push constants (in case of storage images). After
* emitting push constants, on SKL+ we have to emit the corresponding
* 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
*/
uint32_t dirty = 0;
if (descriptors_dirty) {
dirty = genX(cmd_buffer_flush_descriptor_sets)(
cmd_buffer,
&cmd_buffer->state.gfx.base,
descriptors_dirty,
pipeline->base.shaders,
ARRAY_SIZE(pipeline->base.shaders));
cmd_buffer->state.descriptors_dirty &= ~dirty;
}
if (dirty || cmd_buffer->state.push_constants_dirty) {
/* Because we're pushing UBOs, we have to push whenever either
* descriptors or push constants is dirty.
*/
dirty |= cmd_buffer->state.push_constants_dirty &
pipeline->base.base.active_stages;
#if INTEL_NEEDS_WA_1604061319
/* Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if
* any stage has 3DSTATE_CONSTANT_XS emitted.
*/
dirty |= pipeline->base.base.active_stages;
#endif
cmd_buffer_flush_gfx_push_constants(cmd_buffer,
dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
#if GFX_VERx10 >= 125
cmd_buffer_flush_mesh_inline_data(
cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_EXT |
VK_SHADER_STAGE_MESH_BIT_EXT));
#endif
}
if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
cmd_buffer_emit_descriptor_pointers(cmd_buffer,
dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
}
#if GFX_VER >= 20
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE) {
anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BYTE_STRIDE), sb_stride) {
sb_stride.ByteStride = cmd_buffer->state.gfx.indirect_data_stride;
sb_stride.ByteStrideEnable = !cmd_buffer->state.gfx.indirect_data_stride_aligned;
}
}
#endif
cmd_buffer->state.gfx.dirty = 0;
}
void
genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
{
cmd_buffer_flush_gfx_state(cmd_buffer);
}
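/* Heuristic for whether an indirect draw should go through the generated
 * draw path (see genX_cmd_draw_generated_indirect.h) rather than being
 * emitted directly, based on the instance's generated_indirect_threshold
 * and a few workaround/protected-mode constraints.
 */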
ALWAYS_INLINE static bool
anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
{
const struct anv_device *device = cmd_buffer->device;
const struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
/* We cannot generate readable commands in protected mode. */
if (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
return false;
/* Limit generated draws to pipelines without HS stage. This makes things
* simpler for implementing Wa_1306463417, Wa_16011107343.
*/
if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
return false;
return count >= device->physical->instance->generated_indirect_threshold;
}
#include "genX_cmd_draw_helpers.h"
#include "genX_cmd_draw_generated_indirect.h"
ALWAYS_INLINE static void
cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
{
UNUSED const bool protected = cmd_buffer->vk.pool->flags &
VK_COMMAND_POOL_CREATE_PROTECTED_BIT;
UNUSED struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
UNUSED struct anv_device *device = cmd_buffer->device;
UNUSED struct anv_instance *instance = device->physical->instance;
#define DEBUG_SHADER_HASH(stage) do { \
if (unlikely( \
(instance->debug & ANV_DEBUG_SHADER_HASH) && \
anv_pipeline_has_stage(pipeline, stage))) { \
mi_store(&b, \
mi_mem32(device->workaround_address), \
mi_imm(pipeline->base.shaders[stage]-> \
prog_data->source_hash)); \
} \
} while (0)
struct mi_builder b;
if (unlikely(instance->debug & ANV_DEBUG_SHADER_HASH)) {
mi_builder_init(&b, device->info, &cmd_buffer->batch);
mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
}
#if INTEL_WA_16011107343_GFX_VER
if (intel_needs_workaround(cmd_buffer->device->info, 16011107343) &&
anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
DEBUG_SHADER_HASH(MESA_SHADER_TESS_CTRL);
anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
final.hs, protected);
}
#endif
#if INTEL_WA_22018402687_GFX_VER
if (intel_needs_workaround(cmd_buffer->device->info, 22018402687) &&
anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
DEBUG_SHADER_HASH(MESA_SHADER_TESS_EVAL);
/* Wa_22018402687:
* In any 3D enabled context, just before any Tessellation enabled
* draw call (3D Primitive), re-send the last programmed 3DSTATE_DS
* again. This will make sure that the 3DSTATE_INT generated just
* before the draw call will have TDS dirty which will make sure TDS
* will launch the state thread before the draw call.
*
* This fixes a hang resulting from running anything using tessellation
* after a switch away from the mesh pipeline. We don't need to track
* said switch, as it matters at the HW level, and can be triggered even
* across processes, so we apply the Wa at all times.
*/
anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
final.ds, protected);
}
#endif
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
#undef DEBUG_SHADER_HASH
}
ALWAYS_INLINE static void
batch_post_draw_wa(struct anv_batch *batch,
const struct anv_device *device,
uint32_t primitive_topology,
uint32_t vertex_count)
{
#if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
if (intel_needs_workaround(device->info, 22014412737) &&
(primitive_topology == _3DPRIM_POINTLIST ||
primitive_topology == _3DPRIM_LINELIST ||
primitive_topology == _3DPRIM_LINESTRIP ||
primitive_topology == _3DPRIM_LINELIST_ADJ ||
primitive_topology == _3DPRIM_LINESTRIP_ADJ ||
primitive_topology == _3DPRIM_LINELOOP ||
primitive_topology == _3DPRIM_POINTLIST_BF ||
primitive_topology == _3DPRIM_LINESTRIP_CONT ||
primitive_topology == _3DPRIM_LINESTRIP_BF ||
primitive_topology == _3DPRIM_LINESTRIP_CONT_BF) &&
(vertex_count == 1 || vertex_count == 2)) {
genx_batch_emit_pipe_control_write
(batch, device->info, 0, WriteImmediateData,
device->workaround_address, 0, 0);
/* Reset counter because we just emitted a PC */
batch->num_3d_primitives_emitted = 0;
} else if (intel_needs_workaround(device->info, 16014538804)) {
batch->num_3d_primitives_emitted++;
/* WA 16014538804:
* After every three 3D_Primitive commands, at least one PIPE_CONTROL
* must be inserted.
*/
if (batch->num_3d_primitives_emitted == 3) {
anv_batch_emit(batch, GENX(PIPE_CONTROL), pc);
batch->num_3d_primitives_emitted = 0;
}
}
#endif
}
void
genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
const struct anv_device *device,
uint32_t primitive_topology,
uint32_t vertex_count)
{
batch_post_draw_wa(batch, device, primitive_topology, vertex_count);
}
ALWAYS_INLINE static void
cmd_buffer_post_draw_wa(struct anv_cmd_buffer *cmd_buffer,
uint32_t vertex_count,
uint32_t access_type)
{
batch_post_draw_wa(&cmd_buffer->batch, cmd_buffer->device,
cmd_buffer->state.gfx.dyn_state.vft.PrimitiveTopologyType,
vertex_count);
update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, access_type);
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
}
#if GFX_VER >= 11
#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED)
#else
#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE)
#endif
void genX(CmdDraw)(
VkCommandBuffer commandBuffer,
uint32_t vertexCount,
uint32_t instanceCount,
uint32_t firstVertex,
uint32_t firstInstance)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
const uint32_t count =
vertexCount * instanceCount * pipeline->instance_multiplier;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw", count);
trace_intel_begin_draw(&cmd_buffer->trace);
/* Select pipeline here to allow
* cmd_buffer_emit_vertex_constants_and_flush() without flushing before
* cmd_buffer_flush_gfx_state().
*/
genX(flush_pipeline_select_3d)(cmd_buffer);
#if GFX_VER < 11
cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
get_vs_prog_data(pipeline),
firstVertex, firstInstance, 0,
false /* force_flush */);
#endif
cmd_buffer_flush_gfx_state(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
#if GFX_VERx10 >= 125
prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
prim.VertexAccessType = SEQUENTIAL;
prim.VertexCountPerInstance = vertexCount;
prim.StartVertexLocation = firstVertex;
prim.InstanceCount = instanceCount *
pipeline->instance_multiplier;
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = 0;
#if GFX_VER >= 11
prim.ExtendedParametersPresent = true;
prim.ExtendedParameter0 = firstVertex;
prim.ExtendedParameter1 = firstInstance;
prim.ExtendedParameter2 = 0;
#endif
}
cmd_buffer_post_draw_wa(cmd_buffer, vertexCount, SEQUENTIAL);
trace_intel_end_draw(&cmd_buffer->trace, count,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
void genX(CmdDrawMultiEXT)(
VkCommandBuffer commandBuffer,
uint32_t drawCount,
const VkMultiDrawInfoEXT *pVertexInfo,
uint32_t instanceCount,
uint32_t firstInstance,
uint32_t stride)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
UNUSED struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
cmd_buffer_flush_gfx_state(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
uint32_t i = 0;
#if GFX_VER < 11
vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
get_vs_prog_data(pipeline),
draw->firstVertex,
firstInstance, i, !i);
const uint32_t count =
draw->vertexCount * instanceCount * pipeline->instance_multiplier;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw multi", count);
trace_intel_begin_draw_multi(&cmd_buffer->trace);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = SEQUENTIAL;
prim.VertexCountPerInstance = draw->vertexCount;
prim.StartVertexLocation = draw->firstVertex;
prim.InstanceCount = instanceCount *
pipeline->instance_multiplier;
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = 0;
}
cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
pVertexInfo[drawCount - 1].vertexCount,
SEQUENTIAL);
trace_intel_end_draw_multi(&cmd_buffer->trace, count,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
#else
vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
const uint32_t count = draw->vertexCount * instanceCount;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw multi", count);
trace_intel_begin_draw_multi(&cmd_buffer->trace);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
#if GFX_VERx10 >= 125
prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = SEQUENTIAL;
prim.VertexCountPerInstance = draw->vertexCount;
prim.StartVertexLocation = draw->firstVertex;
prim.InstanceCount = instanceCount *
pipeline->instance_multiplier;
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = 0;
prim.ExtendedParametersPresent = true;
prim.ExtendedParameter0 = draw->firstVertex;
prim.ExtendedParameter1 = firstInstance;
prim.ExtendedParameter2 = i;
}
cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
pVertexInfo[drawCount - 1].vertexCount,
SEQUENTIAL);
trace_intel_end_draw_multi(&cmd_buffer->trace, count,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
#endif
}
void genX(CmdDrawIndexed)(
VkCommandBuffer commandBuffer,
uint32_t indexCount,
uint32_t instanceCount,
uint32_t firstIndex,
int32_t vertexOffset,
uint32_t firstInstance)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
const uint32_t count =
indexCount * instanceCount * pipeline->instance_multiplier;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indexed",
count);
trace_intel_begin_draw_indexed(&cmd_buffer->trace);
/* Select pipeline here to allow
* cmd_buffer_emit_vertex_constants_and_flush() without flushing before
* cmd_buffer_flush_gfx_state().
*/
genX(flush_pipeline_select_3d)(cmd_buffer);
#if GFX_VER < 11
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
vertexOffset, firstInstance,
0, false /* force_flush */);
#endif
cmd_buffer_flush_gfx_state(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
#if GFX_VERx10 >= 125
prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
prim.VertexAccessType = RANDOM;
prim.VertexCountPerInstance = indexCount;
prim.StartVertexLocation = firstIndex;
prim.InstanceCount = instanceCount *
pipeline->instance_multiplier;
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = vertexOffset;
#if GFX_VER >= 11
prim.ExtendedParametersPresent = true;
prim.ExtendedParameter0 = vertexOffset;
prim.ExtendedParameter1 = firstInstance;
prim.ExtendedParameter2 = 0;
#endif
}
cmd_buffer_post_draw_wa(cmd_buffer, indexCount, RANDOM);
trace_intel_end_draw_indexed(&cmd_buffer->trace, count,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
void genX(CmdDrawMultiIndexedEXT)(
VkCommandBuffer commandBuffer,
uint32_t drawCount,
const VkMultiDrawIndexedInfoEXT *pIndexInfo,
uint32_t instanceCount,
uint32_t firstInstance,
uint32_t stride,
const int32_t *pVertexOffset)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
cmd_buffer_flush_gfx_state(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
uint32_t i = 0;
#if GFX_VER < 11
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
if (pVertexOffset) {
if (vs_prog_data->uses_drawid) {
bool emitted = true;
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance) {
emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
emitted = true;
}
vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
if (vs_prog_data->uses_drawid) {
emit_draw_index(cmd_buffer, i);
emitted = true;
}
/* Emitting draw index or vertex index BOs may result in needing
* additional VF cache flushes.
*/
if (emitted)
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
const uint32_t count =
draw->indexCount * instanceCount * pipeline->instance_multiplier;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indexed multi",
count);
trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = RANDOM;
prim.VertexCountPerInstance = draw->indexCount;
prim.StartVertexLocation = draw->firstIndex;
prim.InstanceCount = instanceCount *
pipeline->instance_multiplier;
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = *pVertexOffset;
}
cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
pIndexInfo[drawCount - 1].indexCount,
RANDOM);
trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
emitted = false;
}
} else {
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance) {
emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
/* Emitting draw index or vertex index BOs may result in needing
* additional VF cache flushes.
*/
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
const uint32_t count =
draw->indexCount * instanceCount * pipeline->instance_multiplier;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indexed multi",
count);
trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = RANDOM;
prim.VertexCountPerInstance = draw->indexCount;
prim.StartVertexLocation = draw->firstIndex;
prim.InstanceCount = instanceCount *
pipeline->instance_multiplier;
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = *pVertexOffset;
}
cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
pIndexInfo[drawCount - 1].indexCount,
RANDOM);
trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
}
} else {
vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
draw->vertexOffset,
firstInstance, i, i != 0);
const uint32_t count =
draw->indexCount * instanceCount * pipeline->instance_multiplier;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indexed multi",
count);
trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = RANDOM;
prim.VertexCountPerInstance = draw->indexCount;
prim.StartVertexLocation = draw->firstIndex;
prim.InstanceCount = instanceCount *
pipeline->instance_multiplier;
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = draw->vertexOffset;
}
cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
pIndexInfo[drawCount - 1].indexCount,
RANDOM);
trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
}
#else
vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
const uint32_t count =
draw->indexCount * instanceCount * pipeline->instance_multiplier;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indexed multi",
count);
trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) {
#if GFX_VERx10 >= 125
prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = RANDOM;
prim.VertexCountPerInstance = draw->indexCount;
prim.StartVertexLocation = draw->firstIndex;
prim.InstanceCount = instanceCount *
pipeline->instance_multiplier;
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
prim.ExtendedParametersPresent = true;
prim.ExtendedParameter0 = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
prim.ExtendedParameter1 = firstInstance;
prim.ExtendedParameter2 = i;
}
cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
pIndexInfo[drawCount - 1].indexCount,
RANDOM);
trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
#endif
}
/* Auto-Draw / Indirect Registers */
#define GFX7_3DPRIM_END_OFFSET 0x2420
#define GFX7_3DPRIM_START_VERTEX 0x2430
#define GFX7_3DPRIM_VERTEX_COUNT 0x2434
#define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
#define GFX7_3DPRIM_START_INSTANCE 0x243C
#define GFX7_3DPRIM_BASE_VERTEX 0x2440
/* On Gen11+, we have three custom "extended parameters" which we can use to
* provide extra system-generated values to shaders. Our assignment of these
* is arbitrary; we choose to assign them as follows:
*
* gl_BaseVertex = XP0
* gl_BaseInstance = XP1
* gl_DrawID = XP2
*
* For gl_BaseInstance, we never actually have to set up the value because we
* can just program 3DSTATE_VF_SGVS_2 to load it implicitly. We can also do
* that for gl_BaseVertex but it does the wrong thing for indexed draws.
*/
#define GEN11_3DPRIM_XP0 0x2690
#define GEN11_3DPRIM_XP1 0x2694
#define GEN11_3DPRIM_XP2 0x2698
#define GEN11_3DPRIM_XP_BASE_VERTEX GEN11_3DPRIM_XP0
#define GEN11_3DPRIM_XP_BASE_INSTANCE GEN11_3DPRIM_XP1
#define GEN11_3DPRIM_XP_DRAW_ID GEN11_3DPRIM_XP2
void genX(CmdDrawIndirectByteCountEXT)(
VkCommandBuffer commandBuffer,
uint32_t instanceCount,
uint32_t firstInstance,
VkBuffer counterBuffer,
VkDeviceSize counterBufferOffset,
uint32_t counterOffset,
uint32_t vertexStride)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
/* firstVertex is always zero for this draw function */
const uint32_t firstVertex = 0;
if (anv_batch_has_error(&cmd_buffer->batch))
return;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indirect byte count",
instanceCount * pipeline->instance_multiplier);
trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
/* Select pipeline here to allow
* cmd_buffer_emit_vertex_constants_and_flush() without flushing before
* emit_base_vertex_instance() & emit_draw_index().
*/
genX(flush_pipeline_select_3d)(cmd_buffer);
#if GFX_VER < 11
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance)
emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, 0);
#endif
cmd_buffer_flush_gfx_state(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_buffer->address);
mi_builder_set_mocs(&b, mocs);
struct mi_value count =
mi_mem32(anv_address_add(counter_buffer->address,
counterBufferOffset));
if (counterOffset)
count = mi_isub(&b, count, mi_imm(counterOffset));
count = mi_udiv32_imm(&b, count, vertexStride);
mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
mi_imm(instanceCount * pipeline->instance_multiplier));
mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
#if GFX_VER >= 11
mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
mi_imm(firstVertex));
/* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0));
#endif
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
#if GFX_VERx10 >= 125
prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
prim.IndirectParameterEnable = true;
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = SEQUENTIAL;
#if GFX_VER >= 11
prim.ExtendedParametersPresent = true;
#endif
}
cmd_buffer_post_draw_wa(cmd_buffer, 1, SEQUENTIAL);
trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
instanceCount * pipeline->instance_multiplier,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
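/* Use MI commands to load the indirect draw arguments from memory into the
 * 3DPRIM auto-draw registers, applying the instance multiplier and, on
 * Gfx11+, filling the extended parameter registers (base vertex and draw
 * id).
 */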
static void
load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
struct anv_address addr,
bool indexed,
uint32_t draw_id)
{
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
mi_builder_set_mocs(&b, mocs);
mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
mi_mem32(anv_address_add(addr, 0)));
struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
if (pipeline->instance_multiplier > 1) {
instance_count = mi_imul_imm(&b, instance_count,
pipeline->instance_multiplier);
}
mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
mi_mem32(anv_address_add(addr, 8)));
if (indexed) {
mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
mi_mem32(anv_address_add(addr, 12)));
mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
mi_mem32(anv_address_add(addr, 16)));
#if GFX_VER >= 11
mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
mi_mem32(anv_address_add(addr, 12)));
/* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
#endif
} else {
mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
mi_mem32(anv_address_add(addr, 12)));
mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
#if GFX_VER >= 11
mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
mi_mem32(anv_address_add(addr, 8)));
/* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
#endif
}
#if GFX_VER >= 11
mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID),
mi_imm(draw_id));
#endif
}
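/* EXECUTE_INDIRECT_DRAW is only used when the hardware supports indirect
 * unroll and the shaders need neither gl_DrawID, first-vertex nor
 * base-instance system values, and multiview is not in use. Other cases
 * fall back to the other indirect draw paths.
 */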
static inline bool
execute_indirect_draw_supported(const struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 125
const struct intel_device_info *devinfo = cmd_buffer->device->info;
if (!devinfo->has_indirect_unroll)
return false;
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
const bool is_multiview = pipeline->instance_multiplier > 1;
const bool uses_draw_id =
(vs_prog_data && vs_prog_data->uses_drawid) ||
(mesh_prog_data && mesh_prog_data->uses_drawid) ||
(task_prog_data && task_prog_data->uses_drawid);
const bool uses_firstvertex =
(vs_prog_data && vs_prog_data->uses_firstvertex);
const bool uses_baseinstance =
(vs_prog_data && vs_prog_data->uses_baseinstance);
return !is_multiview &&
!uses_draw_id &&
!uses_firstvertex &&
!uses_baseinstance;
#else
return false;
#endif
}
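/* Emit one MI-programmed indirect 3DPRIMITIVE per draw record, stepping
 * through the indirect data at indirect_data_stride intervals.
 */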
static void
emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer,
struct anv_address indirect_data_addr,
uint32_t indirect_data_stride,
uint32_t draw_count,
bool indexed)
{
#if GFX_VER < 11
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
#endif
cmd_buffer_flush_gfx_state(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
uint32_t offset = 0;
for (uint32_t i = 0; i < draw_count; i++) {
struct anv_address draw = anv_address_add(indirect_data_addr, offset);
#if GFX_VER < 11
/* TODO: We need to stomp base vertex to 0 somehow */
/* With sequential draws, we're dealing with the VkDrawIndirectCommand
* structure data. We want to load VkDrawIndirectCommand::firstVertex at
* offset 8 in the structure.
*
* With indexed draws, we're dealing with VkDrawIndexedIndirectCommand.
* We want the VkDrawIndexedIndirectCommand::vertexOffset field at offset 12
* the structure.
*/
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance) {
emit_base_vertex_instance_bo(cmd_buffer,
anv_address_add(draw, indexed ? 12 : 8));
}
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, i);
#endif
/* Emitting draw index or vertex index BOs may result in needing
* additional VF cache flushes.
*/
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
load_indirect_parameters(cmd_buffer, draw, indexed, i);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
#if GFX_VERx10 >= 125
prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
prim.IndirectParameterEnable = true;
prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 11
prim.ExtendedParametersPresent = true;
#endif
}
cmd_buffer_post_draw_wa(cmd_buffer, 1, indexed ? RANDOM : SEQUENTIAL);
offset += indirect_data_stride;
}
}
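/* Map the Vulkan indirect draw command type to the ArgumentFormat field of
* EXECUTE_INDIRECT_DRAW.
*/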
static inline uint32_t xi_argument_format_for_vk_cmd(enum vk_cmd_type cmd)
{
#if GFX_VERx10 >= 125
switch (cmd) {
case VK_CMD_DRAW_INDIRECT:
case VK_CMD_DRAW_INDIRECT_COUNT:
return XI_DRAW;
case VK_CMD_DRAW_INDEXED_INDIRECT:
case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
return XI_DRAWINDEXED;
case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
return XI_MESH_3D;
default:
unreachable("unhandled cmd type");
}
#else
unreachable("unsupported GFX VER");
#endif
}
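/* Returns true when the application stride matches the tightly packed
* Vulkan indirect command size. On Gfx20+ this also records the stride in
* the graphics state and flags ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE when the
* STATE_BYTE_STRIDE programming needs to change.
*/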
static inline bool
cmd_buffer_set_indirect_stride(struct anv_cmd_buffer *cmd_buffer,
uint32_t stride, enum vk_cmd_type cmd)
{
/* Should have been sanitized by the caller */
assert(stride != 0);
uint32_t data_stride = 0;
switch (cmd) {
case VK_CMD_DRAW_INDIRECT:
case VK_CMD_DRAW_INDIRECT_COUNT:
data_stride = sizeof(VkDrawIndirectCommand);
break;
case VK_CMD_DRAW_INDEXED_INDIRECT:
case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
data_stride = sizeof(VkDrawIndexedIndirectCommand);
break;
case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
data_stride = sizeof(VkDrawMeshTasksIndirectCommandEXT);
break;
default:
unreachable("unhandled cmd type");
}
bool aligned = stride == data_stride;
#if GFX_VER >= 20
/* As long as the stride matches the default command stride, we can leave
* STATE_BYTE_STRIDE::ByteStrideEnable=false and do nothing here.
*
* Otherwise STATE_BYTE_STRIDE::ByteStrideEnable=true and any stride
* change has to be signaled.
*/
struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
if (gfx_state->indirect_data_stride_aligned != aligned) {
gfx_state->indirect_data_stride = stride;
gfx_state->indirect_data_stride_aligned = aligned;
gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
} else if (!gfx_state->indirect_data_stride_aligned &&
gfx_state->indirect_data_stride != stride) {
gfx_state->indirect_data_stride = stride;
gfx_state->indirect_data_stride_aligned = aligned;
gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
}
#endif
return aligned;
}
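/* Hardware indirect path using EXECUTE_INDIRECT_DRAW. With a tightly
* packed stride (or on Gfx20+, where the stride is programmed separately
* through STATE_BYTE_STRIDE) a single packet covers up to max_draw_count
* draws; otherwise we emit one packet per draw with MaxCount = 1.
*/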
static void
genX(cmd_buffer_emit_execute_indirect_draws)(struct anv_cmd_buffer *cmd_buffer,
struct anv_address indirect_data_addr,
uint32_t indirect_data_stride,
struct anv_address count_addr,
uint32_t max_draw_count,
enum vk_cmd_type cmd)
{
#if GFX_VERx10 >= 125
bool aligned_stride =
cmd_buffer_set_indirect_stride(cmd_buffer, indirect_data_stride, cmd);
genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
uint32_t offset = 0;
for (uint32_t i = 0; i < max_draw_count; i++) {
struct anv_address draw = anv_address_add(indirect_data_addr, offset);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
ind.ArgumentFormat = xi_argument_format_for_vk_cmd(cmd);
ind.TBIMREnabled = cmd_buffer->state.gfx.dyn_state.use_tbimr;
ind.PredicateEnable =
cmd_buffer->state.conditional_render_enabled;
ind.MaxCount = aligned_stride ? max_draw_count : 1;
ind.ArgumentBufferStartAddress = draw;
ind.CountBufferAddress = count_addr;
ind.CountBufferIndirectEnable = !anv_address_is_null(count_addr);
ind.MOCS =
anv_mocs(cmd_buffer->device, draw.bo, 0);
}
cmd_buffer_post_draw_wa(cmd_buffer, 1,
0 /* Doesn't matter for GFX_VER > 9 */);
/* If all the indirect structures are tightly packed, we can let the HW
* do the unrolling and a single instruction is enough. On Gfx20+ a
* single instruction is also enough with an unaligned stride, since the
* stride is programmed through STATE_BYTE_STRIDE. Otherwise we need to
* emit one instruction per draw, but we still avoid the MI register
* loads.
*/
if (aligned_stride || GFX_VER >= 20)
break;
offset += indirect_data_stride;
}
#endif // GFX_VERx10 >= 125
}
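/* The vkCmdDraw*Indirect* entry points below pick one of three paths:
* EXECUTE_INDIRECT_DRAW when the HW can unroll the draws itself,
* GPU-generated draws when anv_use_generated_draws() decides it is
* worthwhile, and otherwise the MI-register based loops above.
*/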
void genX(CmdDrawIndirect)(
VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
uint32_t drawCount,
uint32_t stride)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indirect",
drawCount);
trace_intel_begin_draw_indirect(&cmd_buffer->trace);
struct anv_address indirect_data_addr =
anv_address_add(buffer->address, offset);
stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
if (execute_indirect_draw_supported(cmd_buffer)) {
genX(cmd_buffer_emit_execute_indirect_draws)(
cmd_buffer,
indirect_data_addr,
stride,
ANV_NULL_ADDRESS /* count_addr */,
drawCount,
VK_CMD_DRAW_INDIRECT);
} else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
indirect_data_addr,
stride,
ANV_NULL_ADDRESS /* count_addr */,
drawCount,
false /* indexed */);
} else {
emit_indirect_draws(cmd_buffer,
indirect_data_addr,
stride, drawCount, false /* indexed */);
}
trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
void genX(CmdDrawIndexedIndirect)(
VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
uint32_t drawCount,
uint32_t stride)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indexed indirect",
drawCount);
trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
struct anv_address indirect_data_addr =
anv_address_add(buffer->address, offset);
stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
if (execute_indirect_draw_supported(cmd_buffer)) {
genX(cmd_buffer_emit_execute_indirect_draws)(
cmd_buffer,
indirect_data_addr,
stride,
ANV_NULL_ADDRESS /* count_addr */,
drawCount,
VK_CMD_DRAW_INDEXED_INDIRECT);
} else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
indirect_data_addr,
stride,
ANV_NULL_ADDRESS /* count_addr */,
drawCount,
true /* indexed */);
} else {
emit_indirect_draws(cmd_buffer,
indirect_data_addr,
stride, drawCount, true /* indexed */);
}
trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount,
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
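/* MMIO offsets of the MI_PREDICATE source and result registers used for
* draw count predication below.
*/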
#define MI_PREDICATE_SRC0 0x2400
#define MI_PREDICATE_SRC1 0x2408
#define MI_PREDICATE_RESULT 0x2418
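/* With conditional rendering enabled, the draw count is loaded into a GPR
* so it can be combined with ANV_PREDICATE_RESULT_REG for each draw.
* Otherwise it is loaded once into MI_PREDICATE_SRC0 and compared against
* the per-draw index uploaded to MI_PREDICATE_SRC1.
*/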
static struct mi_value
prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
struct mi_builder *b,
struct anv_address count_address)
{
struct mi_value ret = mi_imm(0);
if (cmd_buffer->state.conditional_render_enabled) {
ret = mi_new_gpr(b);
mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
} else {
/* Upload the current draw count from the draw parameters buffer to
* MI_PREDICATE_SRC0.
*/
mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
}
return ret;
}
static void
emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
struct mi_builder *b,
uint32_t draw_index)
{
/* Upload the index of the current draw to MI_PREDICATE_SRC1. */
mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
if (draw_index == 0) {
anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
mip.LoadOperation = LOAD_LOADINV;
mip.CombineOperation = COMBINE_SET;
mip.CompareOperation = COMPARE_SRCS_EQUAL;
}
} else {
/* While draw_index < draw_count the predicate's result will be
* (draw_index == draw_count) ^ TRUE = TRUE
* When draw_index == draw_count the result is
* (TRUE) ^ TRUE = FALSE
* After this all results will be:
* (FALSE) ^ FALSE = FALSE
*/
anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
mip.LoadOperation = LOAD_LOAD;
mip.CombineOperation = COMBINE_XOR;
mip.CompareOperation = COMPARE_SRCS_EQUAL;
}
}
}
static void
emit_draw_count_predicate_with_conditional_render(
struct anv_cmd_buffer *cmd_buffer,
struct mi_builder *b,
uint32_t draw_index,
struct mi_value max)
{
struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
}
static void
emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
struct mi_builder *b,
uint32_t draw_index,
struct mi_value max)
{
if (cmd_buffer->state.conditional_render_enabled) {
emit_draw_count_predicate_with_conditional_render(
cmd_buffer, b, draw_index, mi_value_ref(b, max));
} else {
emit_draw_count_predicate(cmd_buffer, b, draw_index);
}
}
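/* Software path for vkCmdDraw*IndirectCount: emit max_draw_count
* predicated draws, each enabled only while its index is below the count
* read from the count buffer.
*/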
static void
emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer,
struct anv_address indirect_data_addr,
uint64_t indirect_data_stride,
struct anv_address draw_count_addr,
uint32_t max_draw_count,
bool indexed)
{
#if GFX_VER < 11
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
#endif
cmd_buffer_flush_gfx_state(cmd_buffer);
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &draw_count_addr);
mi_builder_set_mocs(&b, mocs);
struct mi_value max =
prepare_for_draw_count_predicate(cmd_buffer, &b, draw_count_addr);
for (uint32_t i = 0; i < max_draw_count; i++) {
struct anv_address draw =
anv_address_add(indirect_data_addr, i * indirect_data_stride);
emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
#if GFX_VER < 11
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance) {
emit_base_vertex_instance_bo(cmd_buffer,
anv_address_add(draw, indexed ? 12 : 8));
}
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, i);
/* Emitting draw index or vertex index BOs may result in needing
* additional VF cache flushes.
*/
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif
load_indirect_parameters(cmd_buffer, draw, indexed, i);
cmd_buffer_pre_draw_wa(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
#if GFX_VERx10 >= 125
prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
prim.IndirectParameterEnable = true;
prim.PredicateEnable = true;
prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 11
prim.ExtendedParametersPresent = true;
#endif
}
cmd_buffer_post_draw_wa(cmd_buffer, 1, SEQUENTIAL);
}
mi_value_unref(&b, max);
}
void genX(CmdDrawIndirectCount)(
VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
VkBuffer _countBuffer,
VkDeviceSize countBufferOffset,
uint32_t maxDrawCount,
uint32_t stride)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indirect count",
0);
trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
struct anv_address indirect_data_address =
anv_address_add(buffer->address, offset);
struct anv_address count_address =
anv_address_add(count_buffer->address, countBufferOffset);
stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
if (execute_indirect_draw_supported(cmd_buffer)) {
genX(cmd_buffer_emit_execute_indirect_draws)(
cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
VK_CMD_DRAW_INDIRECT_COUNT);
} else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
false /* indexed */);
} else {
emit_indirect_count_draws(cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
false /* indexed */);
}
trace_intel_end_draw_indirect_count(&cmd_buffer->trace,
anv_address_utrace(count_address),
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
void genX(CmdDrawIndexedIndirectCount)(
VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
VkBuffer _countBuffer,
VkDeviceSize countBufferOffset,
uint32_t maxDrawCount,
uint32_t stride)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw indexed indirect count",
0);
trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
struct anv_address indirect_data_address =
anv_address_add(buffer->address, offset);
struct anv_address count_address =
anv_address_add(count_buffer->address, countBufferOffset);
stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
if (execute_indirect_draw_supported(cmd_buffer)) {
genX(cmd_buffer_emit_execute_indirect_draws)(
cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
VK_CMD_DRAW_INDEXED_INDIRECT_COUNT);
} else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
true /* indexed */);
} else {
emit_indirect_count_draws(cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
true /* indexed */);
}
trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace,
anv_address_utrace(count_address),
pipeline->vs_source_hash,
pipeline->fs_source_hash);
}
void genX(CmdBeginTransformFeedbackEXT)(
VkCommandBuffer commandBuffer,
uint32_t firstCounterBuffer,
uint32_t counterBufferCount,
const VkBuffer* pCounterBuffers,
const VkDeviceSize* pCounterBufferOffsets)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
assert(firstCounterBuffer < MAX_XFB_BUFFERS);
assert(counterBufferCount <= MAX_XFB_BUFFERS);
assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
trace_intel_begin_xfb(&cmd_buffer->trace);
/* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
*
* "Ssoftware must ensure that no HW stream output operations can be in
* process or otherwise pending at the point that the MI_LOAD/STORE
* commands are processed. This will likely require a pipeline flush."
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_CS_STALL_BIT,
"begin transform feedback");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
/* If we have a counter buffer, this is a resume so we need to load the
* value into the streamout offset register. Otherwise, this is a begin
* and we need to reset it to zero.
*/
if (pCounterBuffers &&
idx >= firstCounterBuffer &&
idx - firstCounterBuffer < counterBufferCount &&
pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
uint32_t cb_idx = idx - firstCounterBuffer;
ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
uint64_t offset = pCounterBufferOffsets ?
pCounterBufferOffsets[cb_idx] : 0;
mi_store(&b, mi_reg32(GENX(SO_WRITE_OFFSET0_num) + idx * 4),
mi_mem32(anv_address_add(counter_buffer->address, offset)));
} else {
mi_store(&b, mi_reg32(GENX(SO_WRITE_OFFSET0_num) + idx * 4),
mi_imm(0));
}
}
cmd_buffer->state.xfb_enabled = true;
cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
}
void genX(CmdEndTransformFeedbackEXT)(
VkCommandBuffer commandBuffer,
uint32_t firstCounterBuffer,
uint32_t counterBufferCount,
const VkBuffer* pCounterBuffers,
const VkDeviceSize* pCounterBufferOffsets)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
assert(firstCounterBuffer < MAX_XFB_BUFFERS);
assert(counterBufferCount <= MAX_XFB_BUFFERS);
assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
/* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
*
* "Ssoftware must ensure that no HW stream output operations can be in
* process or otherwise pending at the point that the MI_LOAD/STORE
* commands are processed. This will likely require a pipeline flush."
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_CS_STALL_BIT,
"end transform feedback");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
unsigned idx = firstCounterBuffer + cb_idx;
/* If we have a counter buffer, store the current streamout write offset
* back into it so that a later resume can reload it. Otherwise there is
* nothing to save.
*/
if (pCounterBuffers &&
cb_idx < counterBufferCount &&
pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
uint64_t offset = pCounterBufferOffsets ?
pCounterBufferOffsets[cb_idx] : 0;
anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
srm.MemoryAddress = anv_address_add(counter_buffer->address,
offset);
srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
}
}
}
trace_intel_end_xfb(&cmd_buffer->trace);
cmd_buffer->state.xfb_enabled = false;
cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
}
#if GFX_VERx10 >= 125
void
genX(CmdDrawMeshTasksEXT)(
VkCommandBuffer commandBuffer,
uint32_t x,
uint32_t y,
uint32_t z)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw mesh", x * y * z);
trace_intel_begin_draw_mesh(&cmd_buffer->trace);
/* TODO(mesh): Check whether this emits more packets than we need. */
cmd_buffer_flush_gfx_state(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_3D), m) {
m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
m.ThreadGroupCountX = x;
m.ThreadGroupCountY = y;
m.ThreadGroupCountZ = z;
}
trace_intel_end_draw_mesh(&cmd_buffer->trace, x, y, z);
}
#define GFX125_3DMESH_TG_COUNT 0x26F0
#define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
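/* Load the VkDrawMeshTasksIndirectCommandEXT group counts into the
* 3DMESH_3D thread group count and extended parameter registers.
* Extended parameter 0 carries the draw index when the task/mesh shader
* reads gl_DrawID.
*/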
static void
mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer *cmd_buffer,
struct mi_builder *b,
struct anv_address addr,
bool emit_xp0,
uint32_t xp0)
{
const size_t groupCountXOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountX);
const size_t groupCountYOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountY);
const size_t groupCountZOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountZ);
mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
mi_mem32(anv_address_add(addr, groupCountXOff)));
mi_store(b, mi_reg32(GFX10_3DPRIM_XP(1)),
mi_mem32(anv_address_add(addr, groupCountYOff)));
mi_store(b, mi_reg32(GFX10_3DPRIM_XP(2)),
mi_mem32(anv_address_add(addr, groupCountZOff)));
if (emit_xp0)
mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
}
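/* Emit 3DMESH_3D with indirect parameters, appending one extra dword for
* Extended Parameter 0 when the draw index is needed (the actual value
* comes from the GFX10_3DPRIM_XP(0) register loaded above).
*/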
static void
emit_indirect_3dmesh_3d(struct anv_batch *batch,
bool predicate_enable,
bool uses_drawid)
{
uint32_t len = GENX(3DMESH_3D_length) + uses_drawid;
uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_3D),
.PredicateEnable = predicate_enable,
.IndirectParameterEnable = true,
.ExtendedParameter0Present = uses_drawid);
if (uses_drawid)
dw[len - 1] = 0;
}
void
genX(CmdDrawMeshTasksIndirectEXT)(
VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
uint32_t drawCount,
uint32_t stride)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
struct anv_cmd_state *cmd_state = &cmd_buffer->state;
if (anv_batch_has_error(&cmd_buffer->batch))
return;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw mesh indirect", drawCount);
trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace);
if (execute_indirect_draw_supported(cmd_buffer)) {
genX(cmd_buffer_emit_execute_indirect_draws)(
cmd_buffer,
anv_address_add(buffer->address, offset),
MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
ANV_NULL_ADDRESS /* count_addr */,
drawCount,
VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT);
trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
return;
}
cmd_buffer_flush_gfx_state(cmd_buffer);
if (cmd_state->conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
mesh_prog_data->uses_drawid;
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < drawCount; i++) {
struct anv_address draw = anv_address_add(buffer->address, offset);
mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
emit_indirect_3dmesh_3d(&cmd_buffer->batch,
cmd_state->conditional_render_enabled, uses_drawid);
offset += stride;
}
trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
}
void
genX(CmdDrawMeshTasksIndirectCountEXT)(
VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
VkBuffer _countBuffer,
VkDeviceSize countBufferOffset,
uint32_t maxDrawCount,
uint32_t stride)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw mesh indirect count", 0);
trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace);
struct anv_address count_addr =
anv_address_add(count_buffer->address, countBufferOffset);
if (execute_indirect_draw_supported(cmd_buffer)) {
genX(cmd_buffer_emit_execute_indirect_draws)(
cmd_buffer,
anv_address_add(buffer->address, offset),
MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
count_addr /* count_addr */,
maxDrawCount,
VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT);
trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace,
anv_address_utrace(count_addr));
return;
}
cmd_buffer_flush_gfx_state(cmd_buffer);
bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
mesh_prog_data->uses_drawid;
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_buffer->address);
mi_builder_set_mocs(&b, mocs);
struct mi_value max =
prepare_for_draw_count_predicate(
cmd_buffer, &b, count_addr);
for (uint32_t i = 0; i < maxDrawCount; i++) {
struct anv_address draw = anv_address_add(buffer->address, offset);
emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid);
offset += stride;
}
trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace,
anv_address_utrace(count_addr));
}
#endif /* GFX_VERx10 >= 125 */