/*
* Copyright 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#pragma once
#ifndef __OPENCL_VERSION__
#include <stdint.h>
#include "util/bitscan.h"
#endif
#include "compiler/shader_enums.h"
#include "util/enum_operators.h"
#ifdef __cplusplus
extern "C" {
#endif
/** A tri-state value to track states that are potentially dynamic */
enum intel_sometimes {
INTEL_NEVER = 0,
INTEL_SOMETIMES,
INTEL_ALWAYS
};
static inline enum intel_sometimes
intel_sometimes_invert(enum intel_sometimes x)
{
return (enum intel_sometimes)((int)INTEL_ALWAYS - (int)x);
}
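/* The subtraction above gives, for each input:
*
*    intel_sometimes_invert(INTEL_NEVER)     == INTEL_ALWAYS
*    intel_sometimes_invert(INTEL_SOMETIMES) == INTEL_SOMETIMES
*    intel_sometimes_invert(INTEL_ALWAYS)    == INTEL_NEVER
*/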
#define INTEL_MSAA_FLAG_FIRST_VUE_SLOT_OFFSET (19)
#define INTEL_MSAA_FLAG_FIRST_VUE_SLOT_SIZE (6)
#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET (25)
#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_SIZE (6)
#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH (32)
enum intel_msaa_flags {
/** Must be set whenever any dynamic MSAA is used
*
* This flag mostly exists to let us assert that the driver understands
* dynamic MSAA so we don't run into trouble with drivers that don't.
*/
INTEL_MSAA_FLAG_ENABLE_DYNAMIC = (1 << 0),
/** True if the framebuffer is multisampled */
INTEL_MSAA_FLAG_MULTISAMPLE_FBO = (1 << 1),
/** True if this shader has been dispatched per-sample */
INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH = (1 << 2),
/** True if inputs should be interpolated per-sample by default */
INTEL_MSAA_FLAG_PERSAMPLE_INTERP = (1 << 3),
/** True if this shader has been dispatched with alpha-to-coverage */
INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE = (1 << 4),
/** True if provoking vertex is last */
INTEL_MSAA_FLAG_PROVOKING_VERTEX_LAST = (1 << 5),
/** True if we need to apply Wa_18019110168 remapping */
INTEL_MSAA_FLAG_PER_PRIMITIVE_REMAPPING = (1 << 6),
/** True if this shader has been dispatched coarse
*
* This is intentionally chosen to be bit 15 to correspond to the coarse bit
* in the pixel interpolator messages.
*/
INTEL_MSAA_FLAG_COARSE_PI_MSG = (1 << 15),
/** True if this shader has been dispatched coarse
*
* This is intentionally chosen to be bit 18 to correspond to the coarse bit
* in the render target messages.
*/
INTEL_MSAA_FLAG_COARSE_RT_WRITES = (1 << 18),
/** First slot read in the VUE
*
* This is not a flag but a value that covers 6 bits.
*/
INTEL_MSAA_FLAG_FIRST_VUE_SLOT = (1 << INTEL_MSAA_FLAG_FIRST_VUE_SLOT_OFFSET),
/** Index of the PrimitiveID attribute relative to the first read
* attribute.
*
* This is not a flag but a value that covers 6 bits. Value 32 means the
* PrimitiveID is coming from the PerPrimitive block, written by the Mesh
* shader.
*/
INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX = (1 << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET),
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(intel_msaa_flags)
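/* For illustration, the two multi-bit fields above can be decoded from a
* pushed intel_msaa_flags value with plain shifts and masks (hypothetical
* helper, not part of this header):
*
*    static inline uint32_t
*    intel_msaa_flags_first_vue_slot(enum intel_msaa_flags flags)
*    {
*       return ((uint32_t)flags >> INTEL_MSAA_FLAG_FIRST_VUE_SLOT_OFFSET) &
*              ((1u << INTEL_MSAA_FLAG_FIRST_VUE_SLOT_SIZE) - 1);
*    }
*/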
/**
* @defgroup Tessellator parameter enumerations.
*
* These correspond to the hardware values in 3DSTATE_TE, and are provided
* as part of the tessellation evaluation shader.
*
* @{
*/
enum intel_tess_partitioning {
INTEL_TESS_PARTITIONING_INTEGER = 0,
INTEL_TESS_PARTITIONING_ODD_FRACTIONAL = 1,
INTEL_TESS_PARTITIONING_EVEN_FRACTIONAL = 2,
};
enum intel_tess_output_topology {
INTEL_TESS_OUTPUT_TOPOLOGY_POINT = 0,
INTEL_TESS_OUTPUT_TOPOLOGY_LINE = 1,
INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW = 2,
INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3,
};
enum intel_tess_domain {
INTEL_TESS_DOMAIN_QUAD = 0,
INTEL_TESS_DOMAIN_TRI = 1,
INTEL_TESS_DOMAIN_ISOLINE = 2,
};
/** @} */
enum intel_shader_dispatch_mode {
INTEL_DISPATCH_MODE_4X1_SINGLE = 0,
INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
INTEL_DISPATCH_MODE_SIMD8 = 3,
INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH = 0,
INTEL_DISPATCH_MODE_TCS_MULTI_PATCH = 2,
};
enum intel_barycentric_mode {
INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL = 0,
INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID = 1,
INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE = 2,
INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL = 3,
INTEL_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE = 5,
INTEL_BARYCENTRIC_MODE_COUNT = 6
};
#define INTEL_BARYCENTRIC_PERSPECTIVE_BITS \
((1 << INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL) | \
(1 << INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID) | \
(1 << INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE))
#define INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS \
((1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
(1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
(1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))
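/* For example, a shader interpolating only at the pixel center and centroid
* with perspective correction would report
* modes == (BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL) |
*           BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID)), i.e. 0x3.
*/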
enum intel_vue_layout {
/**
* Layout is fixed and shared by producer/consumer, allowing for tight
* packing
*/
INTEL_VUE_LAYOUT_FIXED = 0,
/**
* Layout is separate, works for ARB_separate_shader_objects but without
* Mesh support.
*/
INTEL_VUE_LAYOUT_SEPARATE,
/**
* Layout is separate and works with Mesh shaders.
*/
INTEL_VUE_LAYOUT_SEPARATE_MESH,
};
/**
* Data structure recording the relationship between the gl_varying_slot enum
* and "slots" within the vertex URB entry (VUE). A "slot" is defined as a
* single octaword within the VUE (128 bits).
*
* Note that each BRW register contains 256 bits (2 octawords), so when
* accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two
* consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as
* in a vertex shader), each register corresponds to a single VUE slot, since
* it contains data for two separate vertices.
*/
struct intel_vue_map {
/**
* Bitfield representing all varying slots that are (a) stored in this VUE
* map, and (b) actually written by the shader. Does not include any of
* the additional varying slots defined in brw_varying_slot.
*/
uint64_t slots_valid;
/**
* The layout of the VUE
*
* Separable programs (GL_ARB_separate_shader_objects) can be mixed and
* matched without the linker having a chance to dead code eliminate unused
* varyings.
*
* This means that we have to use a fixed slot layout, based on the output's
* location field, rather than assigning slots in a compact contiguous block.
*
* When using Mesh, another constraint arises which is the HW limits for
* loading per-primitive & per-vertex data, limited to 32 varyings in total.
* This requires us to be quite inventive with the way we lay things out.
* Take a fragment shader loading the following data:
*
* float gl_ClipDistance[];
* uint gl_PrimitiveID;
* vec4 someAppValue[29];
*
* According to the Vulkan spec, someAppValue will occupy 29 slots,
* gl_PrimitiveID 1 slot, gl_ClipDistance[] up to 2 slots. If the input is
* coming from a VS/DS/GS shader, we can load all of this through a single
* block using 3DSTATE_SBE::VertexURBEntryReadLength = 16 (maximum
* programmable value) and the layout with
* INTEL_VUE_LAYOUT_FIXED/INTEL_VUE_LAYOUT_SEPARATE will be this:
*
* -----------------------
* | gl_ClipDistance 0-3 |
* |---------------------|
* | gl_ClipDistance 4-7 |
* |---------------------|
* | gl_PrimitiveID |
* |---------------------|
* | someAppValue[] |
* |---------------------|
*
* This works nicely as everything is coming from the same location in the
* URB.
*
* When mesh shaders are involved, gl_PrimitiveID is located in a different
* place in the URB (the per-primitive block) and requires programming
* 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength to load some
* additional data. The HW has a limit such that
* 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength +
* 3DSTATE_SBE_MESH::PerVertexURBEntryOutputReadLength <= 16. With the
* layout above, we would not be able to accommodate that HW limit.
*
* The solution to this is to lay the built-in varyings out
* (gl_ClipDistance omitted since it's part of the VUE header and cannot
* live anywhere else) at the end of the VUE like this:
*
* -----------------------
* | gl_ClipDistance 0-3 |
* |---------------------|
* | gl_ClipDistance 4-7 |
* |---------------------|
* | someAppValue[] |
* |---------------------|
* | gl_PrimitiveID |
* |---------------------|
*
* This layout adds another challenge because with separate shader
* compilations, we cannot tell in the consumer shader how many outputs the
* producer has, so we don't know where the gl_PrimitiveID lives. The
* solution to this other problem is to read the built-in with a
* MOV_INDIRECT and have the offset of the MOV_INDIRECT loaded through a
* push constant.
*/
enum intel_vue_layout layout;
/**
* Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are
* not stored in a slot (because they are not written, or because
* additional processing is applied before storing them in the VUE), the
* value is -1.
*/
signed char varying_to_slot[VARYING_SLOT_TESS_MAX];
/**
* Map from VUE slot to gl_varying_slot value. For slots that do not
* directly correspond to a gl_varying_slot, the value comes from
* brw_varying_slot.
*
* For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
*/
signed char slot_to_varying[VARYING_SLOT_TESS_MAX];
/**
* Total number of VUE slots in use
*/
int num_slots;
/**
* Number of position VUE slots. If num_pos_slots > 1, primitive
* replication is being used.
*/
int num_pos_slots;
/**
* Number of per-patch VUE slots. Only valid for tessellation control
* shader outputs and tessellation evaluation shader inputs.
*/
int num_per_patch_slots;
/**
* Number of per-vertex VUE slots. Only valid for tessellation control
* shader outputs and tessellation evaluation shader inputs.
*/
int num_per_vertex_slots;
};
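/* A minimal lookup sketch (assuming a populated map): find where a varying
* lives in the VUE, remembering that unwritten varyings map to -1.
*
*    int slot = vue_map->varying_to_slot[VARYING_SLOT_POS];
*    if (slot >= 0) {
*       unsigned offset_B = slot * 16; // each slot is one 128-bit octaword
*    }
*/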
struct intel_cs_dispatch_info {
/* Total number of invocations in the local workgroup */
uint32_t group_size;
/* SIMD width the kernel is dispatched at (8, 16 or 32) */
uint32_t simd_size;
/* Number of hardware threads needed to cover group_size */
uint32_t threads;
/* RightExecutionMask field used in GPGPU_WALKER. */
uint32_t right_mask;
};
enum intel_compute_walk_order {
INTEL_WALK_ORDER_XYZ = 0,
INTEL_WALK_ORDER_XZY = 1,
INTEL_WALK_ORDER_YXZ = 2,
INTEL_WALK_ORDER_YZX = 3,
INTEL_WALK_ORDER_ZXY = 4,
INTEL_WALK_ORDER_ZYX = 5,
};
static inline bool
intel_fs_is_persample(enum intel_sometimes shader_persample_dispatch,
bool shader_per_sample_shading,
enum intel_msaa_flags pushed_msaa_flags)
{
if (shader_persample_dispatch != INTEL_SOMETIMES)
return shader_persample_dispatch;
assert(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC);
if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_MULTISAMPLE_FBO))
return false;
if (shader_per_sample_shading)
assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
return (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0;
}
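/* For example, with shader_persample_dispatch == INTEL_SOMETIMES and
* shader_per_sample_shading == false, pushing
* INTEL_MSAA_FLAG_ENABLE_DYNAMIC | INTEL_MSAA_FLAG_MULTISAMPLE_FBO |
* INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH yields true, while dropping either
* MULTISAMPLE_FBO or PERSAMPLE_DISPATCH yields false.
*/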
static inline uint32_t
intel_fs_barycentric_modes(enum intel_sometimes shader_persample_dispatch,
uint32_t shader_barycentric_modes,
enum intel_msaa_flags pushed_msaa_flags)
{
/* In the non-dynamic case, we can just return the shader_barycentric_modes
* computed at compile time.
*/
if (shader_persample_dispatch != INTEL_SOMETIMES)
return shader_barycentric_modes;
uint32_t modes = shader_barycentric_modes;
assert(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC);
if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_INTERP) {
assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
/* Making dynamic per-sample interpolation work is a bit tricky. The
* hardware will hang if SAMPLE is requested but per-sample dispatch is
* not enabled. This means we can't preemptively add SAMPLE to the
* barycentrics bitfield. Instead, we have to add it late and only
* on-demand. Annoyingly, changing the number of barycentrics requested
* changes the whole PS shader payload so we very much don't want to do
* that. Instead, if the dynamic per-sample interpolation flag is set,
* we check to see if SAMPLE was requested and, if not, replace the
* highest barycentric bit in the [non]perspective grouping (CENTROID,
* if it exists, else PIXEL) with SAMPLE. The shader will stomp all the
* barycentrics in the shader with SAMPLE so it really doesn't matter
* which one we replace. The important thing is that we keep the number
* of barycentrics in each [non]perspective grouping the same.
*/
if ((modes & INTEL_BARYCENTRIC_PERSPECTIVE_BITS) &&
!(modes & BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE))) {
int sample_mode =
util_last_bit(modes & INTEL_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
assert(modes & BITFIELD_BIT(sample_mode));
modes &= ~BITFIELD_BIT(sample_mode);
modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
}
if ((modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) &&
!(modes & BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) {
int sample_mode =
util_last_bit(modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
assert(modes & BITFIELD_BIT(sample_mode));
modes &= ~BITFIELD_BIT(sample_mode);
modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE);
}
} else {
/* If we're not using per-sample interpolation, we need to disable the
* per-sample bits.
*
* SKL PRMs, Volume 2a: Command Reference: Instructions,
* 3DSTATE_WM:Barycentric Interpolation Mode:
* "MSDISPMODE_PERSAMPLE is required in order to select Perspective
* Sample or Non-perspective Sample barycentric coordinates."
*/
uint32_t sample_bits = (BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE) |
BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE));
uint32_t requested_sample = modes & sample_bits;
modes &= ~sample_bits;
/*
* If the shader requested some sample modes and we have to disable
* them, make sure we add the pixel variant back so as not to mess up
* the thread payload.
*
* Why does this work out? Because of the ordering in the thread payload:
*
* R7:10  Perspective Centroid Barycentric
* R11:14 Perspective Sample Barycentric
* R15:18 Linear Pixel Location Barycentric
*
* In the backend, when persample dispatch is dynamic, we always select
* the sample barycentric and turn off the pixel location (even if
* requested through intrinsics). That way, when we dynamically select
* pixel or sample dispatch, the barycentrics always match, since the
* pixel location barycentric register offset will align with the sample
* barycentric.
*/
if (requested_sample) {
if (requested_sample & BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE))
modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL);
if (requested_sample & BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))
modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL);
}
}
return modes;
}
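/* Worked example: a shader compiled with modes ==
* (PERSPECTIVE_PIXEL | PERSPECTIVE_CENTROID) and dynamic persample dispatch.
* When INTEL_MSAA_FLAG_PERSAMPLE_INTERP is pushed, CENTROID (the highest
* perspective bit) is swapped for SAMPLE, giving (PIXEL | SAMPLE); when it is
* not, there is no SAMPLE bit to strip and the modes come back unchanged.
* Either way the perspective group keeps exactly two barycentrics, so the
* thread payload layout is preserved.
*/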
static inline bool
intel_fs_is_coarse(enum intel_sometimes shader_coarse_pixel_dispatch,
enum intel_msaa_flags pushed_msaa_flags)
{
if (shader_coarse_pixel_dispatch != INTEL_SOMETIMES)
return shader_coarse_pixel_dispatch;
assert(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC);
assert((pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES) ?
shader_coarse_pixel_dispatch != INTEL_NEVER :
shader_coarse_pixel_dispatch != INTEL_ALWAYS);
return (pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES) != 0;
}
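/* For example, with shader_coarse_pixel_dispatch == INTEL_SOMETIMES, pushing
* INTEL_MSAA_FLAG_ENABLE_DYNAMIC | INTEL_MSAA_FLAG_COARSE_RT_WRITES selects
* the coarse variant (true); omitting COARSE_RT_WRITES selects the per-pixel
* variant (false).
*/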
struct intel_fs_params {
/* Whether the shader itself forces per-sample shading */
bool shader_sample_shading;
/* Minimum fraction of samples to shade when sample shading is enabled */
float shader_min_sample_shading;
/* Whether sample shading is enabled through API state */
bool state_sample_shading;
uint32_t rasterization_samples;
/* Whether coarse pixel shading is enabled */
bool coarse_pixel;
bool alpha_to_coverage;
bool provoking_vertex_last;
/* Values packed into INTEL_MSAA_FLAG_FIRST_VUE_SLOT &
* INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX
*/
uint32_t first_vue_slot;
uint32_t primitive_id_index;
/* Whether the Wa_18019110168 remapping is applied */
bool per_primitive_remapping;
};
static inline enum intel_msaa_flags
intel_fs_msaa_flags(struct intel_fs_params params)
{
enum intel_msaa_flags fs_msaa_flags = INTEL_MSAA_FLAG_ENABLE_DYNAMIC;
if (params.rasterization_samples > 1) {
fs_msaa_flags |= INTEL_MSAA_FLAG_MULTISAMPLE_FBO;
if (params.shader_sample_shading)
fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH;
if (params.shader_sample_shading ||
(params.state_sample_shading &&
(params.shader_min_sample_shading *
params.rasterization_samples) > 1)) {
fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH |
INTEL_MSAA_FLAG_PERSAMPLE_INTERP;
}
}
if (!(fs_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) &&
params.coarse_pixel) {
fs_msaa_flags |= INTEL_MSAA_FLAG_COARSE_PI_MSG |
INTEL_MSAA_FLAG_COARSE_RT_WRITES;
}
if (params.alpha_to_coverage)
fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE;
assert(params.first_vue_slot < (1 << INTEL_MSAA_FLAG_FIRST_VUE_SLOT_SIZE));
fs_msaa_flags |= (enum intel_msaa_flags)(
params.first_vue_slot << INTEL_MSAA_FLAG_FIRST_VUE_SLOT_OFFSET);
assert(params.primitive_id_index < (1u << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_SIZE));
fs_msaa_flags |= (enum intel_msaa_flags)(
params.primitive_id_index << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET);
if (params.provoking_vertex_last)
fs_msaa_flags |= INTEL_MSAA_FLAG_PROVOKING_VERTEX_LAST;
if (params.per_primitive_remapping)
fs_msaa_flags |= INTEL_MSAA_FLAG_PER_PRIMITIVE_REMAPPING;
return fs_msaa_flags;
}
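/* A minimal usage sketch (hypothetical driver-side values, not part of this
* header): packing the dynamic MSAA state for a single-sampled draw with
* alpha-to-coverage enabled:
*
*    struct intel_fs_params params = {
*       .rasterization_samples = 1,
*       .alpha_to_coverage = true,
*    };
*    enum intel_msaa_flags flags = intel_fs_msaa_flags(params);
*
* yields flags == (INTEL_MSAA_FLAG_ENABLE_DYNAMIC |
*                  INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE).
*/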
#ifdef __cplusplus
} /* extern "C" */
#endif