| /* |
| * Copyright 2024 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #pragma once |
| |
| #ifndef __OPENCL_VERSION__ |
| #include <stdint.h> |
| #include "util/bitscan.h" |
| #endif |
| |
| #include "compiler/shader_enums.h" |
| #include "util/enum_operators.h" |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| |
/**
 * Tri-state value for properties that are potentially dynamic.
 *
 * The numeric ordering NEVER < SOMETIMES < ALWAYS is relied upon by
 * intel_sometimes_invert().
 */
enum intel_sometimes {
   INTEL_NEVER     = 0,
   INTEL_SOMETIMES = 1,
   INTEL_ALWAYS    = 2,
};

/**
 * Mirror a tri-state value around INTEL_SOMETIMES.
 *
 * INTEL_NEVER becomes INTEL_ALWAYS, INTEL_ALWAYS becomes INTEL_NEVER and
 * INTEL_SOMETIMES maps to itself.
 */
static inline enum intel_sometimes
intel_sometimes_invert(enum intel_sometimes x)
{
   const int mirrored = (int)INTEL_ALWAYS - (int)x;

   return (enum intel_sometimes)mirrored;
}
| |
/* Bit position (OFFSET) and width in bits (SIZE) of the multi-bit values
 * that are packed into enum intel_msaa_flags next to the single-bit flags.
 */
#define INTEL_MSAA_FLAG_FIRST_VUE_SLOT_OFFSET     (19)
#define INTEL_MSAA_FLAG_FIRST_VUE_SLOT_SIZE       (6)
#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET (25)
#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_SIZE   (6)
/* PRIMITIVE_ID_INDEX value meaning that the PrimitiveID comes from the
 * PerPrimitive block written by the Mesh shader.
 */
#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH   (32)

enum intel_msaa_flags {
   /**
    * Must be set whenever any dynamic MSAA is used.
    *
    * This flag mostly exists to let us assert that the driver understands
    * dynamic MSAA, so we don't run into trouble with drivers that don't.
    */
   INTEL_MSAA_FLAG_ENABLE_DYNAMIC = (1 << 0),

   /** Set when the framebuffer is multisampled. */
   INTEL_MSAA_FLAG_MULTISAMPLE_FBO = (1 << 1),

   /** Set when this shader has been dispatched per-sample. */
   INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH = (1 << 2),

   /** Set when inputs should be interpolated per-sample by default. */
   INTEL_MSAA_FLAG_PERSAMPLE_INTERP = (1 << 3),

   /** Set when this shader has been dispatched with alpha-to-coverage. */
   INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE = (1 << 4),

   /** Set when the provoking vertex is the last one. */
   INTEL_MSAA_FLAG_PROVOKING_VERTEX_LAST = (1 << 5),

   /** Set when the Wa_18019110168 remapping needs to be applied. */
   INTEL_MSAA_FLAG_PER_PRIMITIVE_REMAPPING = (1 << 6),

   /**
    * Set when this shader has been dispatched coarse.
    *
    * This is intentionally chosen to be bit 15 so that it corresponds to
    * the coarse bit in the pixel interpolator messages.
    */
   INTEL_MSAA_FLAG_COARSE_PI_MSG = (1 << 15),

   /**
    * Set when this shader has been dispatched coarse.
    *
    * This is intentionally chosen to be bit 18 so that it corresponds to
    * the coarse bit in the render target messages.
    */
   INTEL_MSAA_FLAG_COARSE_RT_WRITES = (1 << 18),

   /**
    * First slot read in the VUE.
    *
    * This is not a flag but a value that covers 6 bits; the enumerator is
    * the lowest bit of that field.
    */
   INTEL_MSAA_FLAG_FIRST_VUE_SLOT =
      (1 << INTEL_MSAA_FLAG_FIRST_VUE_SLOT_OFFSET),

   /**
    * Index of the PrimitiveID attribute relative to the first read
    * attribute.
    *
    * This is not a flag but a value that covers 6 bits; the enumerator is
    * the lowest bit of that field.  Value 32 means the PrimitiveID is coming
    * from the PerPrimitive block, written by the Mesh shader.
    */
   INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX =
      (1 << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET),
};
| MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(intel_msaa_flags) |
| |
/**
 * @defgroup Tessellator parameter enumerations.
 *
 * The values below correspond to the hardware values in 3DSTATE_TE and are
 * provided as part of the tessellation evaluation shader.
 *
 * @{
 */

/** Tessellator partitioning values. */
enum intel_tess_partitioning {
   INTEL_TESS_PARTITIONING_INTEGER         = 0,
   INTEL_TESS_PARTITIONING_ODD_FRACTIONAL  = 1,
   INTEL_TESS_PARTITIONING_EVEN_FRACTIONAL = 2,
};

/** Tessellator output topology values. */
enum intel_tess_output_topology {
   INTEL_TESS_OUTPUT_TOPOLOGY_POINT   = 0,
   INTEL_TESS_OUTPUT_TOPOLOGY_LINE    = 1,
   INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW  = 2,
   INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3,
};

/** Tessellator domain values. */
enum intel_tess_domain {
   INTEL_TESS_DOMAIN_QUAD    = 0,
   INTEL_TESS_DOMAIN_TRI     = 1,
   INTEL_TESS_DOMAIN_ISOLINE = 2,
};
/** @} */
| |
/**
 * Shader dispatch mode values.
 *
 * The two TCS enumerators deliberately share numeric values (0 and 2) with
 * enumerators of the first group, so a raw value alone does not tell which
 * group of names applies.
 */
enum intel_shader_dispatch_mode {
   INTEL_DISPATCH_MODE_4X1_SINGLE        = 0,
   INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
   INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT   = 2,
   INTEL_DISPATCH_MODE_SIMD8             = 3,

   /* TCS-specific names, aliasing the values above. */
   INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH  = 0,
   INTEL_DISPATCH_MODE_TCS_MULTI_PATCH   = 2,
};
| |
/**
 * Barycentric interpolation modes.
 *
 * Enumerators 0-2 form the perspective group and 3-5 the non-perspective
 * group; within each group the order is PIXEL, CENTROID, SAMPLE.  The values
 * are used as bit indices in barycentric-mode bitfields.
 */
enum intel_barycentric_mode {
   INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
   INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
   INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
   INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
   INTEL_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
   INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
   INTEL_BARYCENTRIC_MODE_COUNT              = 6,
};

/* Mask of every perspective mode bit. */
#define INTEL_BARYCENTRIC_PERSPECTIVE_BITS            \
   ((1 << INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL)      | \
    (1 << INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID)   | \
    (1 << INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE))

/* Mask of every non-perspective mode bit. */
#define INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS            \
   ((1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL)      | \
    (1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_CENTROID)   | \
    (1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))
| |
/** How varyings are laid out in the VUE. */
enum intel_vue_layout {
   /**
    * Layout is fixed and shared by producer/consumer, allowing for tight
    * packing.
    */
   INTEL_VUE_LAYOUT_FIXED = 0,

   /**
    * Layout is separate; works for ARB_separate_shader_objects but without
    * Mesh support.
    */
   INTEL_VUE_LAYOUT_SEPARATE = 1,

   /**
    * Layout is separate and works with Mesh shaders.
    */
   INTEL_VUE_LAYOUT_SEPARATE_MESH = 2,
};
| |
| /** |
| * Data structure recording the relationship between the gl_varying_slot enum |
| * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a |
| * single octaword within the VUE (128 bits). |
| * |
| * Note that each BRW register contains 256 bits (2 octawords), so when |
| * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two |
| * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as |
| * in a vertex shader), each register corresponds to a single VUE slot, since |
| * it contains data for two separate vertices. |
| */ |
| struct intel_vue_map { |
| /** |
| * Bitfield representing all varying slots that are (a) stored in this VUE |
| * map, and (b) actually written by the shader. Does not include any of |
| * the additional varying slots defined in brw_varying_slot. |
| */ |
| uint64_t slots_valid; |
| |
| /** |
| * The layout of the VUE |
| * |
| * Separable programs (GL_ARB_separate_shader_objects) can be mixed and |
| * matched without the linker having a chance to dead code eliminate unused |
| * varyings. |
| * |
| * This means that we have to use a fixed slot layout, based on the output's |
| * location field, rather than assigning slots in a compact contiguous block. |
| * |
| * When using Mesh, another constraint arises which is the HW limits for |
| * loading per-primitive & per-vertex data, limited to 32 varying in total. |
| * This requires us to be quite inventive with the way we lay things out. |
| * Take a fragment shader loading the following data : |
| * |
| * float gl_ClipDistance[]; |
| * uint gl_PrimitiveID; |
| * vec4 someAppValue[29]; |
| * |
| * According to the Vulkan spec, someAppValue will occupy 29 slots, |
| * gl_PrimitiveID 1 slot, gl_ClipDistance[] up to 2 slots. If the input is |
| * coming from a VS/DS/GS shader, we can load all of this through a single |
| * block using 3DSTATE_SBE::VertexURBEntryReadLength = 16 (maximum |
| * programmable value) and the layout with |
| * BRW_VUE_MAP_LAYOUT_FIXED/BRW_VUE_MAP_LAYOUT_SEPARATE will be this : |
| * |
| * ----------------------- |
| * | gl_ClipDistance 0-3 | |
| * |---------------------| |
| * | gl_ClipDistance 4-7 | |
| * |---------------------| |
| * | gl_PrimitiveID | |
| * |---------------------| |
| * | someAppValue[] | |
| * |---------------------| |
| * |
| * This works nicely as everything is coming from the same location in the |
| * URB. |
| * |
| * When mesh shaders are involved, gl_PrimitiveID is located in a different |
| * place in the URB (the per-primitive block) and requires programming |
| * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength to load some |
| * additional data. The HW has a limit such that |
| * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength + |
| * 3DSTATE_SBE_MESH::PerVertexURBEntryOutputReadLength <= 16. With the |
| * layout above, we would not be able to accomodate that HW limit. |
| * |
| * The solution to this is to lay the built-in varyings out |
| * (gl_ClipDistance omitted since it's part of the VUE header and cannot |
| * live any other place) at the end of the VUE like this : |
| * |
| * ----------------------- |
| * | gl_ClipDistance 0-3 | |
| * |---------------------| |
| * | gl_ClipDistance 4-7 | |
| * |---------------------| |
| * | someAppValue[] | |
| * |---------------------| |
| * | gl_PrimitiveID | |
| * |---------------------| |
| * |
| * This layout adds another challenge because with separate shader |
| * compilations, we cannot tell in the consumer shader how many outputs the |
| * producer has, so we don't know where the gl_PrimitiveID lives. The |
| * solution to this other problem is to read the built-in with a |
| * MOV_INDIRECT and have the offset of the MOV_INDIRECT loaded through a |
| * push constant. |
| */ |
| enum intel_vue_layout layout; |
| |
| /** |
| * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are |
| * not stored in a slot (because they are not written, or because |
| * additional processing is applied before storing them in the VUE), the |
| * value is -1. |
| */ |
| signed char varying_to_slot[VARYING_SLOT_TESS_MAX]; |
| |
| /** |
| * Map from VUE slot to gl_varying_slot value. For slots that do not |
| * directly correspond to a gl_varying_slot, the value comes from |
| * brw_varying_slot. |
| * |
| * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD. |
| */ |
| signed char slot_to_varying[VARYING_SLOT_TESS_MAX]; |
| |
| /** |
| * Total number of VUE slots in use |
| */ |
| int num_slots; |
| |
| /** |
| * Number of position VUE slots. If num_pos_slots > 1, primitive |
| * replication is being used. |
| */ |
| int num_pos_slots; |
| |
| /** |
| * Number of per-patch VUE slots. Only valid for tessellation control |
| * shader outputs and tessellation evaluation shader inputs. |
| */ |
| int num_per_patch_slots; |
| |
| /** |
| * Number of per-vertex VUE slots. Only valid for tessellation control |
| * shader outputs and tessellation evaluation shader inputs. |
| */ |
| int num_per_vertex_slots; |
| }; |
| |
/**
 * Dispatch parameters for a compute shader.
 *
 * NOTE(review): the producer of this struct is not visible in this header;
 * confirm the exact meaning of group_size/simd_size/threads against it.
 */
struct intel_cs_dispatch_info {
   uint32_t group_size;
   uint32_t simd_size;
   uint32_t threads;

   /** RightExecutionMask field used in GPGPU_WALKER. */
   uint32_t right_mask;
};
| |
/**
 * Compute walk order values; each enumerator names one of the six
 * permutations of the X, Y and Z axes.
 */
enum intel_compute_walk_order {
   INTEL_WALK_ORDER_XYZ = 0,
   INTEL_WALK_ORDER_XZY = 1,
   INTEL_WALK_ORDER_YXZ = 2,
   INTEL_WALK_ORDER_YZX = 3,
   INTEL_WALK_ORDER_ZXY = 4,
   INTEL_WALK_ORDER_ZYX = 5,
};
| |
| static inline bool |
| intel_fs_is_persample(enum intel_sometimes shader_persample_dispatch, |
| bool shader_per_sample_shading, |
| enum intel_msaa_flags pushed_msaa_flags) |
| { |
| if (shader_persample_dispatch != INTEL_SOMETIMES) |
| return shader_persample_dispatch; |
| |
| assert(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC); |
| |
| if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_MULTISAMPLE_FBO)) |
| return false; |
| |
| if (shader_per_sample_shading) |
| assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH); |
| |
| return (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0; |
| } |
| |
| static inline uint32_t |
| intel_fs_barycentric_modes(enum intel_sometimes shader_persample_dispatch, |
| uint32_t shader_barycentric_modes, |
| enum intel_msaa_flags pushed_msaa_flags) |
| { |
| /* In the non dynamic case, we can just return the computed shader_barycentric_modes from |
| * compilation time. |
| */ |
| if (shader_persample_dispatch != INTEL_SOMETIMES) |
| return shader_barycentric_modes; |
| |
| uint32_t modes = shader_barycentric_modes; |
| |
| assert(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC); |
| |
| if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_INTERP) { |
| assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH); |
| |
| /* Making dynamic per-sample interpolation work is a bit tricky. The |
| * hardware will hang if SAMPLE is requested but per-sample dispatch is |
| * not enabled. This means we can't preemptively add SAMPLE to the |
| * barycentrics bitfield. Instead, we have to add it late and only |
| * on-demand. Annoyingly, changing the number of barycentrics requested |
| * changes the whole PS shader payload so we very much don't want to do |
| * that. Instead, if the dynamic per-sample interpolation flag is set, |
| * we check to see if SAMPLE was requested and, if not, replace the |
| * highest barycentric bit in the [non]perspective grouping (CENTROID, |
| * if it exists, else PIXEL) with SAMPLE. The shader will stomp all the |
| * barycentrics in the shader with SAMPLE so it really doesn't matter |
| * which one we replace. The important thing is that we keep the number |
| * of barycentrics in each [non]perspective grouping the same. |
| */ |
| if ((modes & INTEL_BARYCENTRIC_PERSPECTIVE_BITS) && |
| !(modes & BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE))) { |
| int sample_mode = |
| util_last_bit(modes & INTEL_BARYCENTRIC_PERSPECTIVE_BITS) - 1; |
| assert(modes & BITFIELD_BIT(sample_mode)); |
| |
| modes &= ~BITFIELD_BIT(sample_mode); |
| modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE); |
| } |
| |
| if ((modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) && |
| !(modes & BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) { |
| int sample_mode = |
| util_last_bit(modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1; |
| assert(modes & BITFIELD_BIT(sample_mode)); |
| |
| modes &= ~BITFIELD_BIT(sample_mode); |
| modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE); |
| } |
| } else { |
| /* If we're not using per-sample interpolation, we need to disable the |
| * per-sample bits. |
| * |
| * SKL PRMs, Volume 2a: Command Reference: Instructions, |
| * 3DSTATE_WM:Barycentric Interpolation Mode: |
| |
| * "MSDISPMODE_PERSAMPLE is required in order to select Perspective |
| * Sample or Non-perspective Sample barycentric coordinates." |
| */ |
| uint32_t sample_bits = (BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE) | |
| BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)); |
| uint32_t requested_sample = modes & sample_bits; |
| modes &= ~sample_bits; |
| /* |
| * If the shader requested some sample modes and we have to disable |
| * them, make sure we add back the pixel variant back to not mess up the |
| * thread payload. |
| * |
| * Why does this works out? Because of the ordering in the thread payload : |
| * |
| * R7:10 Perspective Centroid Barycentric |
| * R11:14 Perspective Sample Barycentric |
| * R15:18 Linear Pixel Location Barycentric |
| * |
| * In the backend when persample dispatch is dynamic, we always select |
| * the sample barycentric and turn off the pixel location (even if |
| * requested through intrinsics). That way when we dynamically select |
| * pixel or sample dispatch, the barycentric always match, since the |
| * pixel location barycentric register offset will align with the sample |
| * barycentric. |
| */ |
| if (requested_sample) { |
| if (requested_sample & BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE)) |
| modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL); |
| if (requested_sample & BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)) |
| modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL); |
| } |
| } |
| |
| return modes; |
| } |
| |
| |
| static inline bool |
| intel_fs_is_coarse(enum intel_sometimes shader_coarse_pixel_dispatch, |
| enum intel_msaa_flags pushed_msaa_flags) |
| { |
| if (shader_coarse_pixel_dispatch != INTEL_SOMETIMES) |
| return shader_coarse_pixel_dispatch; |
| |
| assert(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC); |
| |
| assert((pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES) ? |
| shader_coarse_pixel_dispatch != INTEL_NEVER : |
| shader_coarse_pixel_dispatch != INTEL_ALWAYS); |
| |
| return (pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES) != 0; |
| } |
| |
/**
 * Inputs consumed by intel_fs_msaa_flags() to build the dynamic
 * enum intel_msaa_flags value of a fragment shader.
 */
struct intel_fs_params {
   /** Requests per-sample dispatch and interpolation when multisampled. */
   bool shader_sample_shading;

   /** Multiplied by rasterization_samples; only considered together with
    *  state_sample_shading.
    */
   float shader_min_sample_shading;

   /** With shader_min_sample_shading * rasterization_samples > 1, requests
    *  per-sample dispatch and interpolation when multisampled.
    */
   bool state_sample_shading;

   /** A value greater than 1 marks the framebuffer as multisampled. */
   uint32_t rasterization_samples;

   /** Requests coarse dispatch; ignored when per-sample dispatch is set. */
   bool coarse_pixel;

   /** Sets INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE. */
   bool alpha_to_coverage;

   /** Sets INTEL_MSAA_FLAG_PROVOKING_VERTEX_LAST. */
   bool provoking_vertex_last;

   /** Value of the FIRST_VUE_SLOT field; must fit in
    *  INTEL_MSAA_FLAG_FIRST_VUE_SLOT_SIZE bits.
    */
   uint32_t first_vue_slot;

   /** Value of the PRIMITIVE_ID_INDEX field; must fit in
    *  INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_SIZE bits.
    */
   uint32_t primitive_id_index;

   /** Sets INTEL_MSAA_FLAG_PER_PRIMITIVE_REMAPPING. */
   bool per_primitive_remapping;
};
| |
| static inline enum intel_msaa_flags |
| intel_fs_msaa_flags(struct intel_fs_params params) |
| { |
| enum intel_msaa_flags fs_msaa_flags = INTEL_MSAA_FLAG_ENABLE_DYNAMIC; |
| |
| if (params.rasterization_samples > 1) { |
| fs_msaa_flags |= INTEL_MSAA_FLAG_MULTISAMPLE_FBO; |
| |
| if (params.shader_sample_shading) |
| fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH; |
| |
| if (params.shader_sample_shading || |
| (params.state_sample_shading && |
| (params.shader_min_sample_shading * |
| params.rasterization_samples) > 1)) { |
| fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH | |
| INTEL_MSAA_FLAG_PERSAMPLE_INTERP; |
| } |
| } |
| |
| if (!(fs_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) && |
| params.coarse_pixel) { |
| fs_msaa_flags |= INTEL_MSAA_FLAG_COARSE_PI_MSG | |
| INTEL_MSAA_FLAG_COARSE_RT_WRITES; |
| } |
| |
| if (params.alpha_to_coverage) |
| fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE; |
| |
| assert(params.first_vue_slot < (1 << INTEL_MSAA_FLAG_FIRST_VUE_SLOT_SIZE)); |
| fs_msaa_flags |= (enum intel_msaa_flags)( |
| params.first_vue_slot << INTEL_MSAA_FLAG_FIRST_VUE_SLOT_OFFSET); |
| |
| assert(params.primitive_id_index < (1u << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_SIZE)); |
| fs_msaa_flags |= (enum intel_msaa_flags)( |
| params.primitive_id_index << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET); |
| |
| if (params.provoking_vertex_last) |
| fs_msaa_flags |= INTEL_MSAA_FLAG_PROVOKING_VERTEX_LAST; |
| |
| if (params.per_primitive_remapping) |
| fs_msaa_flags |= INTEL_MSAA_FLAG_PER_PRIMITIVE_REMAPPING; |
| |
| return fs_msaa_flags; |
| } |
| |
| #ifdef __cplusplus |
| } /* extern "C" */ |
| #endif |