/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file pa.h
*
* @brief Definitions for primitive assembly.
* N primitives are assembled at a time, where N is the SIMD width.
* A state machine, specific to a given topology, drives the
* assembly of vertices into primitives.
*
******************************************************************************/
#pragma once
#include "frontend.h"
struct PA_STATE
{
#if USE_SIMD16_FRONTEND
enum
{
SIMD_WIDTH = KNOB_SIMD16_WIDTH,
SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
SIMD_WIDTH_LOG2 = 4
};
typedef simd16mask SIMDMASK;
typedef simd16scalar SIMDSCALAR;
typedef simd16vector SIMDVECTOR;
typedef simd16vertex SIMDVERTEX;
typedef simd16scalari SIMDSCALARI;
#else
enum
{
SIMD_WIDTH = KNOB_SIMD_WIDTH,
SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
SIMD_WIDTH_LOG2 = 3
};
typedef simdmask SIMDMASK;
typedef simdscalar SIMDSCALAR;
typedef simdvector SIMDVECTOR;
typedef simdvertex SIMDVERTEX;
typedef simdscalari SIMDSCALARI;
#endif
DRAW_CONTEXT *pDC{ nullptr }; // draw context
uint8_t* pStreamBase{ nullptr }; // vertex stream
uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts
uint32_t vertexStride{ 0 }; // stride of a vertex in simdvector units
// The topology the binner will use. In some cases the FE changes the topology from the api state.
PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
#if ENABLE_AVX512_SIMD16
bool useAlternateOffset{ false };
#endif
PA_STATE() {}
PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {}
virtual bool HasWork() = 0;
virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
virtual bool NextPrim() = 0;
virtual SIMDVERTEX& GetNextVsOutput() = 0;
virtual bool GetNextStreamOutput() = 0;
virtual SIMDMASK& GetNextVsIndices() = 0;
virtual uint32_t NumPrims() = 0;
virtual void Reset() = 0;
virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};
// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence:
// 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
// 2. Execute PA function to assemble and bin triangles.
// a. The PA function is a set of functions that collectively make up the
// state machine for a given topology.
// 1. We use a state index to track which PA function to call.
// b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
// 1. We call this the current and previous simd vertex.
// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
// order to assemble the second triangle, for a triangle list, we'll need the
// last vertex from the previous simd and the first 2 vertices from the current simd.
// 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used for non-indexed draws or draws
// without cuts.
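//
// Illustrative front-end loop driving the optimized PA (a hypothetical sketch;
// RunVertexShader and BinTriangles are placeholder names, and slot 0 is assumed
// to hold position -- the real loop lives in the front-end, not in this file):
//
//     PA_STATE_OPT pa(pDC, numPrims, pStreamBase, streamSizeInVerts,
//                     vertexStride, false, TOP_TRIANGLE_LIST);
//     while (pa.HasWork())
//     {
//         PA_STATE::SIMDVERTEX& vsOut = pa.GetNextVsOutput();
//         RunVertexShader(vsOut);         // produce one simd of shaded verts
//         do
//         {
//             simdvector prim[3];         // 3 verts x SIMD_WIDTH triangles
//             if (pa.Assemble(0, prim))
//             {
//                 BinTriangles(pa, prim); // bin up to SIMD_WIDTH triangles
//             }
//         } while (pa.NextPrim());        // true while more prims remain in this simd
//     }
//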
struct PA_STATE_OPT : public PA_STATE
{
uint32_t numPrims{ 0 }; // Total number of primitives for draw.
uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
uint32_t cur{ 0 }; // index to current VS output.
uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
const uint32_t first{ 0 }; // index to first VS output. Used for tri fan and line loop.
uint32_t counter{ 0 }; // state counter
bool reset{ false }; // reset state
uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
SIMDSCALARI primID;
typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling a full SIMD of triangles.
#if ENABLE_AVX512_SIMD16
PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
#endif
PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling a single triangle.
PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
#if ENABLE_AVX512_SIMD16
PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
#endif
// state used to advance the PA when Next is called
PFN_PA_FUNC pfnPaNextFunc{ nullptr };
#if ENABLE_AVX512_SIMD16
PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
#endif
uint32_t nextNumSimdPrims{ 0 };
uint32_t nextNumPrimsIncrement{ 0 };
bool nextReset{ false };
bool isStreaming{ false };
SIMDMASK junkIndices { 0 }; // temporary index store for unused virtual function
PA_STATE_OPT() {}
PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
bool HasWork()
{
return this->numPrimsComplete < this->numPrims;
}
simdvector& GetSimdVector(uint32_t index, uint32_t slot)
{
SWR_ASSERT(slot < vertexStride);
uint32_t offset = index * vertexStride + slot;
simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
return vertexSlot;
}
#if ENABLE_AVX512_SIMD16
simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
SWR_ASSERT(slot < vertexStride);
uint32_t offset = index * vertexStride + slot;
simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
return vertexSlot;
}
#endif
// Assembles a SIMD's worth of triangles. For an SSE (4-wide) simd, each
// simdvector holds one vertex from each of 4 triangles (xxxx yyyy zzzz wwww),
// and there are 3 verts per triangle.
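// e.g. for an SSE simd assembling triangles T0..T3, verts[0].x holds
// { T0.v0.x, T1.v0.x, T2.v0.x, T3.v0.x }, verts[1].x the second vertex of
// each triangle, and so on for y/z/w.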
bool Assemble(uint32_t slot, simdvector verts[])
{
return this->pfnPaFunc(*this, slot, verts);
}
#if ENABLE_AVX512_SIMD16
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
{
return this->pfnPaFunc_simd16(*this, slot, verts);
}
#endif
// Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
}
bool NextPrim()
{
this->pfnPaFunc = this->pfnPaNextFunc;
#if ENABLE_AVX512_SIMD16
this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
#endif
this->numSimdPrims = this->nextNumSimdPrims;
this->numPrimsComplete += this->nextNumPrimsIncrement;
this->reset = this->nextReset;
if (this->isStreaming)
{
this->reset = false;
}
bool morePrims = false;
if (this->numSimdPrims > 0)
{
morePrims = true;
this->numSimdPrims--;
}
else
{
this->counter = (this->reset) ? 0 : (this->counter + 1);
this->reset = false;
}
if (!HasWork())
{
morePrims = false; // no more to do
}
return morePrims;
}
SIMDVERTEX& GetNextVsOutput()
{
const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
// increment cur and prev indices
if (counter < numSimdVerts)
{
// prev undefined for first state
prev = cur;
cur = counter;
}
else
{
// swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
uint32_t temp = prev;
prev = cur;
cur = temp;
}
SWR_ASSERT(cur < numSimdVerts);
SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
return *(SIMDVERTEX*)pVertex;
}
SIMDMASK& GetNextVsIndices()
{
// unused in optimized PA, pass tmp buffer back
return junkIndices;
}
bool GetNextStreamOutput()
{
this->prev = this->cur;
this->cur = this->counter;
return HasWork();
}
uint32_t NumPrims()
{
return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
(SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
}
void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
uint32_t numSimdPrims = 0,
uint32_t numPrimsIncrement = 0,
bool reset = false)
{
this->pfnPaNextFunc = pfnPaNextFunc;
this->nextNumSimdPrims = numSimdPrims;
this->nextNumPrimsIncrement = numPrimsIncrement;
this->nextReset = reset;
this->pfnPaSingleFunc = pfnPaNextSingleFunc;
}
#if ENABLE_AVX512_SIMD16
void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
uint32_t numSimdPrims = 0,
uint32_t numPrimsIncrement = 0,
bool reset = false)
{
this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
this->pfnPaNextFunc = pfnPaNextFunc;
this->nextNumSimdPrims = numSimdPrims;
this->nextNumPrimsIncrement = numPrimsIncrement;
this->nextReset = reset;
this->pfnPaSingleFunc = pfnPaNextSingleFunc;
}
#endif
void Reset()
{
#if ENABLE_AVX512_SIMD16
useAlternateOffset = false;
#endif
this->pfnPaFunc = this->pfnPaFuncReset;
#if ENABLE_AVX512_SIMD16
this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
#endif
this->numPrimsComplete = 0;
this->numSimdPrims = 0;
this->cur = 0;
this->prev = 0;
this->counter = 0;
this->reset = false;
}
SIMDSCALARI GetPrimID(uint32_t startID)
{
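// primID holds the per-lane IDs within the current simd of prims; offset it by
// the draw's startID plus primIDIncr for each full SIMD of completed prims.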
#if USE_SIMD16_FRONTEND
return _simd16_add_epi32(this->primID,
_simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#else
return _simd_add_epi32(this->primID,
_simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#endif
}
};
// helper C wrappers to avoid having to rewrite all the PA topology state functions
INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
uint32_t numSimdPrims = 0,
uint32_t numPrimsIncrement = 0,
bool reset = false)
{
return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}
#if ENABLE_AVX512_SIMD16
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
uint32_t numSimdPrims = 0,
uint32_t numPrimsIncrement = 0,
bool reset = false)
{
return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}
#endif
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
return pa.GetSimdVector(index, slot);
}
#if ENABLE_AVX512_SIMD16
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
return pa.GetSimdVector_simd16(index, slot);
}
#endif
// Cut-aware primitive assembler.
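// A cut (primitive restart) index terminates the current strip/fan and starts a
// new one. Illustrative example for a triangle strip with the vertex sequence
// 0 1 2 3 CUT 4 5 6:
//     tris assembled: (0,1,2), then (1,3,2) -- winding is reversed on odd
//     strip triangles -- then the topology restarts at 4, yielding (4,5,6).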
struct PA_STATE_CUT : public PA_STATE
{
SIMDMASK* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
uint32_t numVerts{ 0 }; // number of vertices available in buffer store
uint32_t numAttribs{ 0 }; // number of attributes
int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
#else
OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
#endif
SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
uint32_t tailVertex{ 0 }; // first vertex of the prims currently being assembled
uint32_t curVertex{ 0 }; // current unprocessed vertex
uint32_t startPrimId{ 0 }; // starting prim id
SIMDSCALARI vPrimId; // vector of prim ID
bool needOffsets{ false }; // need to compute gather offsets for current SIMD
uint32_t vertsPerPrim{ 0 };
bool processCutVerts{ false }; // if true, vertex indices with cuts are processed as normal; otherwise they
// are ignored. The fetch shader sends invalid verts on cuts, which should be ignored,
// while the GS sends valid verts for every index.
simdvector junkVector; // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
// Topology state tracking
uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
uint32_t curIndex{ 0 };
bool reverseWinding{ false }; // indicates reverse winding for strips
int32_t adjExtraVert{ 0 }; // extra vert used for tristrip w/ adj
typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
PA_STATE_CUT() {}
PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
: PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
{
numVerts = in_streamSizeInVerts;
numAttribs = in_numAttribs;
binTopology = topo;
needOffsets = false;
processCutVerts = in_processCutVerts;
numVertsToAssemble = numRemainingVerts = in_numVerts;
numPrimsAssembled = 0;
headVertex = tailVertex = curVertex = 0;
curIndex = 0;
pCutIndices = in_pIndices;
memset(indices, 0, sizeof(indices));
#if USE_SIMD16_FRONTEND
vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
reverseWinding = false;
adjExtraVert = -1;
bool gsEnabled = pDC->pState->state.gsState.gsEnable;
vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
switch (topo)
{
case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
case TOP_TRI_STRIP_ADJ: if (gsEnabled)
{
pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
}
else
{
pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
}
break;
case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
default: assert(0 && "Unimplemented topology");
}
}
SIMDVERTEX& GetNextVsOutput()
{
uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
this->needOffsets = true;
SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
return *(SIMDVERTEX*)pVertex;
}
SIMDMASK& GetNextVsIndices()
{
uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
return *pCurCutIndex;
}
simdvector& GetSimdVector(uint32_t index, uint32_t slot)
{
// unused
SWR_ASSERT(0 && "Not implemented");
return junkVector;
}
#if ENABLE_AVX512_SIMD16
simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
// unused
SWR_ASSERT(0 && "Not implemented");
return junkVector_simd16;
}
#endif
bool GetNextStreamOutput()
{
this->headVertex += SIMD_WIDTH;
this->needOffsets = true;
return HasWork();
}
SIMDSCALARI GetPrimID(uint32_t startID)
{
#if USE_SIMD16_FRONTEND
return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
}
void Reset()
{
#if ENABLE_AVX512_SIMD16
useAlternateOffset = false;
#endif
this->numRemainingVerts = this->numVertsToAssemble;
this->numPrimsAssembled = 0;
this->curIndex = 0;
this->curVertex = 0;
this->tailVertex = 0;
this->headVertex = 0;
this->reverseWinding = false;
this->adjExtraVert = -1;
#if USE_SIMD16_FRONTEND
this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
}
bool HasWork()
{
return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
}
bool IsVertexStoreFull()
{
return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
}
void RestartTopology()
{
this->curIndex = 0;
this->reverseWinding = false;
this->adjExtraVert = -1;
}
bool IsCutIndex(uint32_t vertex)
{
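// One cut bit per vertex: e.g. with SIMD_WIDTH == 8, vertex 19 tests bit
// (19 & 7) == 3 of cut-index word 19 / 8 == 2.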
uint32_t vertexIndex = vertex / SIMD_WIDTH;
uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
}
// Iterates across the unprocessed verts until we hit the end or we
// have assembled a full SIMD of prims.
void ProcessVerts()
{
while (this->numPrimsAssembled != SIMD_WIDTH &&
this->numRemainingVerts > 0 &&
this->curVertex != this->headVertex)
{
// if cut index, restart topology
if (IsCutIndex(this->curVertex))
{
if (this->processCutVerts)
{
(this->*pfnPa)(this->curVertex, false);
}
// finish off tri strip w/ adj before restarting topo
if (this->adjExtraVert != -1)
{
(this->*pfnPa)(this->curVertex, true);
}
RestartTopology();
}
else
{
(this->*pfnPa)(this->curVertex, false);
}
this->curVertex++;
if (this->curVertex >= this->numVerts)
{
this->curVertex = 0;
}
this->numRemainingVerts--;
}
// special case last primitive for tri strip w/ adj
if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
{
(this->*pfnPa)(this->curVertex, true);
}
}
void Advance()
{
// done with current batch
// advance tail to the current unsubmitted vertex
this->tailVertex = this->curVertex;
this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
}
bool NextPrim()
{
// if we've assembled enough prims, we can advance to the next set of verts
if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
{
Advance();
}
return false;
}
void ComputeOffsets()
{
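// Scalar equivalent of the gather offset computed below for one index:
//     offset = (index >> SIMD_WIDTH_LOG2) * vertexStride * sizeof(SIMDVECTOR)
//            + (index & (SIMD_WIDTH - 1)) * sizeof(float);
// i.e. step to the simdvertex batch holding the vertex, then to its lane.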
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
{
uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
// step to simdvertex batch
const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
#else
SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
#endif
// step to index
const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
}
}
bool Assemble(uint32_t slot, simdvector *verts)
{
// process any outstanding verts
ProcessVerts();
// return false if we don't have enough prims assembled
if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
{
return false;
}
// cache off gather offsets given the current SIMD set of indices the first time we get an assemble
if (this->needOffsets)
{
ComputeOffsets();
this->needOffsets = false;
}
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
{
SIMDSCALARI offsets = this->vOffsets[v];
// step to attribute
#if USE_SIMD16_FRONTEND
offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif
float* pBase = (float*)this->pStreamBase;
for (uint32_t c = 0; c < 4; ++c)
{
#if USE_SIMD16_FRONTEND
simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif
// move base to next component
pBase += SIMD_WIDTH;
}
}
return true;
}
#if ENABLE_AVX512_SIMD16
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
{
// process any outstanding verts
ProcessVerts();
// return false if we don't have enough prims assembled
if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
{
return false;
}
// cache off gather offsets given the current SIMD set of indices the first time we get an assemble
if (this->needOffsets)
{
ComputeOffsets();
this->needOffsets = false;
}
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
{
SIMDSCALARI offsets = this->vOffsets[v];
// step to attribute
#if USE_SIMD16_FRONTEND
offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif
float* pBase = (float*)this->pStreamBase;
for (uint32_t c = 0; c < 4; ++c)
{
#if USE_SIMD16_FRONTEND
verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif
// move base to next component
pBase += SIMD_WIDTH;
}
}
return true;
}
#endif
void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
{
// move to slot
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
{
uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
uint32_t offset = pOffset[triIndex];
#endif
offset += sizeof(SIMDVECTOR) * slot;
float* pVert = (float*)&tri[v];
for (uint32_t c = 0; c < 4; ++c)
{
float* pComponent = (float*)(this->pStreamBase + offset);
pVert[c] = *pComponent;
offset += SIMD_WIDTH * sizeof(float);
}
}
}
uint32_t NumPrims()
{
return this->numPrimsAssembled;
}
// Per-topology functions
void ProcessVertTriStrip(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 3)
{
// assembled enough verts for prim, add to gather indices
this->indices[0][this->numPrimsAssembled] = this->vert[0];
if (reverseWinding)
{
this->indices[1][this->numPrimsAssembled] = this->vert[2];
this->indices[2][this->numPrimsAssembled] = this->vert[1];
}
else
{
this->indices[1][this->numPrimsAssembled] = this->vert[1];
this->indices[2][this->numPrimsAssembled] = this->vert[2];
}
// increment numPrimsAssembled
this->numPrimsAssembled++;
// set up next prim state
this->vert[0] = this->vert[1];
this->vert[1] = this->vert[2];
this->curIndex = 2;
this->reverseWinding ^= 1;
}
}
template<bool gsEnabled>
void AssembleTriStripAdj()
{
if (!gsEnabled)
{
this->vert[1] = this->vert[2];
this->vert[2] = this->vert[4];
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->indices[1][this->numPrimsAssembled] = this->vert[1];
this->indices[2][this->numPrimsAssembled] = this->vert[2];
this->vert[4] = this->vert[2];
this->vert[2] = this->vert[1];
}
else
{
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->indices[1][this->numPrimsAssembled] = this->vert[1];
this->indices[2][this->numPrimsAssembled] = this->vert[2];
this->indices[3][this->numPrimsAssembled] = this->vert[3];
this->indices[4][this->numPrimsAssembled] = this->vert[4];
this->indices[5][this->numPrimsAssembled] = this->vert[5];
}
this->numPrimsAssembled++;
}
template<bool gsEnabled>
void ProcessVertTriStripAdj(uint32_t index, bool finish)
{
// handle last primitive of tristrip
if (finish && this->adjExtraVert != -1)
{
this->vert[3] = this->adjExtraVert;
AssembleTriStripAdj<gsEnabled>();
this->adjExtraVert = -1;
return;
}
switch (this->curIndex)
{
case 0:
case 1:
case 2:
case 4:
this->vert[this->curIndex] = index;
this->curIndex++;
break;
case 3:
this->vert[5] = index;
this->curIndex++;
break;
case 5:
if (this->adjExtraVert == -1)
{
this->adjExtraVert = index;
}
else
{
this->vert[3] = index;
if (!gsEnabled)
{
AssembleTriStripAdj<gsEnabled>();
uint32_t nextTri[6];
if (this->reverseWinding)
{
nextTri[0] = this->vert[4];
nextTri[1] = this->vert[0];
nextTri[2] = this->vert[2];
nextTri[4] = this->vert[3];
nextTri[5] = this->adjExtraVert;
}
else
{
nextTri[0] = this->vert[2];
nextTri[1] = this->adjExtraVert;
nextTri[2] = this->vert[3];
nextTri[4] = this->vert[4];
nextTri[5] = this->vert[0];
}
for (uint32_t i = 0; i < 6; ++i)
{
this->vert[i] = nextTri[i];
}
this->adjExtraVert = -1;
this->reverseWinding ^= 1;
}
else
{
this->curIndex++;
}
}
break;
case 6:
SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
AssembleTriStripAdj<gsEnabled>();
uint32_t nextTri[6];
if (this->reverseWinding)
{
nextTri[0] = this->vert[4];
nextTri[1] = this->vert[0];
nextTri[2] = this->vert[2];
nextTri[4] = this->vert[3];
nextTri[5] = this->adjExtraVert;
}
else
{
nextTri[0] = this->vert[2];
nextTri[1] = this->adjExtraVert;
nextTri[2] = this->vert[3];
nextTri[4] = this->vert[4];
nextTri[5] = this->vert[0];
}
for (uint32_t i = 0; i < 6; ++i)
{
this->vert[i] = nextTri[i];
}
this->reverseWinding ^= 1;
this->adjExtraVert = index;
this->curIndex--;
break;
}
}
void ProcessVertTriList(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 3)
{
// assembled enough verts for prim, add to gather indices
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->indices[1][this->numPrimsAssembled] = this->vert[1];
this->indices[2][this->numPrimsAssembled] = this->vert[2];
// increment numPrimsAssembled
this->numPrimsAssembled++;
// set up next prim state
this->curIndex = 0;
}
}
void ProcessVertTriListAdj(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 6)
{
// assembled enough verts for prim, add to gather indices
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->indices[1][this->numPrimsAssembled] = this->vert[1];
this->indices[2][this->numPrimsAssembled] = this->vert[2];
this->indices[3][this->numPrimsAssembled] = this->vert[3];
this->indices[4][this->numPrimsAssembled] = this->vert[4];
this->indices[5][this->numPrimsAssembled] = this->vert[5];
// increment numPrimsAssembled
this->numPrimsAssembled++;
// set up next prim state
this->curIndex = 0;
}
}
void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 6)
{
// assembled enough verts for prim, add to gather indices
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->indices[1][this->numPrimsAssembled] = this->vert[2];
this->indices[2][this->numPrimsAssembled] = this->vert[4];
// increment numPrimsAssembled
this->numPrimsAssembled++;
// set up next prim state
this->curIndex = 0;
}
}
void ProcessVertLineList(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 2)
{
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->indices[1][this->numPrimsAssembled] = this->vert[1];
this->numPrimsAssembled++;
this->curIndex = 0;
}
}
void ProcessVertLineStrip(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 2)
{
// assembled enough verts for prim, add to gather indices
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->indices[1][this->numPrimsAssembled] = this->vert[1];
// increment numPrimsAssembled
this->numPrimsAssembled++;
// set up next prim state
this->vert[0] = this->vert[1];
this->curIndex = 1;
}
}
void ProcessVertLineStripAdj(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 4)
{
// assembled enough verts for prim, add to gather indices
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->indices[1][this->numPrimsAssembled] = this->vert[1];
this->indices[2][this->numPrimsAssembled] = this->vert[2];
this->indices[3][this->numPrimsAssembled] = this->vert[3];
// increment numPrimsAssembled
this->numPrimsAssembled++;
// set up next prim state
this->vert[0] = this->vert[1];
this->vert[1] = this->vert[2];
this->vert[2] = this->vert[3];
this->curIndex = 3;
}
}
void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 4)
{
// assembled enough verts for prim, add to gather indices
this->indices[0][this->numPrimsAssembled] = this->vert[1];
this->indices[1][this->numPrimsAssembled] = this->vert[2];
// increment numPrimsAssembled
this->numPrimsAssembled++;
// set up next prim state
this->vert[0] = this->vert[1];
this->vert[1] = this->vert[2];
this->vert[2] = this->vert[3];
this->curIndex = 3;
}
}
void ProcessVertLineListAdj(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 4)
{
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->indices[1][this->numPrimsAssembled] = this->vert[1];
this->indices[2][this->numPrimsAssembled] = this->vert[2];
this->indices[3][this->numPrimsAssembled] = this->vert[3];
this->numPrimsAssembled++;
this->curIndex = 0;
}
}
void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 4)
{
this->indices[0][this->numPrimsAssembled] = this->vert[1];
this->indices[1][this->numPrimsAssembled] = this->vert[2];
this->numPrimsAssembled++;
this->curIndex = 0;
}
}
void ProcessVertPointList(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->curIndex++;
if (this->curIndex == 1)
{
this->indices[0][this->numPrimsAssembled] = this->vert[0];
this->numPrimsAssembled++;
this->curIndex = 0;
}
}
};
// Primitive Assembly for data output from the DomainShader.
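// Vertex data is laid out attribute-major: for attribute slot s, each of the 4
// components occupies m_attributeStrideInVectors SIMDSCALARs, and m_ppIndices
// supplies up to three parallel index streams, one per vertex of the primitive
// being assembled.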
struct PA_TESS : PA_STATE
{
PA_TESS(
DRAW_CONTEXT *in_pDC,
const SIMDSCALAR* in_pVertData,
uint32_t in_attributeStrideInVectors,
uint32_t in_vertexStride,
uint32_t in_numAttributes,
uint32_t* (&in_ppIndices)[3],
uint32_t in_numPrims,
PRIMITIVE_TOPOLOGY in_binTopology) :
PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
m_pVertexData(in_pVertData),
m_attributeStrideInVectors(in_attributeStrideInVectors),
m_numAttributes(in_numAttributes),
m_numPrims(in_numPrims)
{
#if USE_SIMD16_FRONTEND
m_vPrimId = _simd16_setzero_si();
#else
m_vPrimId = _simd_setzero_si();
#endif
binTopology = in_binTopology;
m_ppIndices[0] = in_ppIndices[0];
m_ppIndices[1] = in_ppIndices[1];
m_ppIndices[2] = in_ppIndices[2];
switch (binTopology)
{
case TOP_POINT_LIST:
m_numVertsPerPrim = 1;
break;
case TOP_LINE_LIST:
m_numVertsPerPrim = 2;
break;
case TOP_TRIANGLE_LIST:
m_numVertsPerPrim = 3;
break;
default:
SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
break;
}
}
bool HasWork()
{
return m_numPrims != 0;
}
simdvector& GetSimdVector(uint32_t index, uint32_t slot)
{
SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
return junkVector;
}
#if ENABLE_AVX512_SIMD16
simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
return junkVector_simd16;
}
#endif
static SIMDSCALARI GenPrimMask(uint32_t numPrims)
{
SWR_ASSERT(numPrims <= SIMD_WIDTH);
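// Sliding-window mask: e.g. with SIMD_WIDTH == 8 and numPrims == 3, the load
// below starts at maskGen[5], yielding { -1, -1, -1, 0, 0, 0, 0, 0 }, i.e.
// lanes 0..2 active.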
#if USE_SIMD16_FRONTEND
static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
{
-1, -1, -1, -1, -1, -1, -1, -1,
0, 0, 0, 0, 0, 0, 0, 0
};
return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
}
bool Assemble(uint32_t slot, simdvector verts[])
{
SWR_ASSERT(slot < m_numAttributes);
uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
if (0 == numPrimsToAssemble)
{
return false;
}
SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
{
#if USE_SIMD16_FRONTEND
SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif
const float* pBase = pBaseAttrib;
for (uint32_t c = 0; c < 4; ++c)
{
#if USE_SIMD16_FRONTEND
simd16scalar temp = _simd16_mask_i32gather_ps(
_simd16_setzero_ps(),
pBase,
indices,
_simd16_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
verts[i].v[c] = _simd_mask_i32gather_ps(
_simd_setzero_ps(),
pBase,
indices,
_simd_castsi_ps(mask),
4); // gcc doesn't like sizeof(float)
#endif
pBase += m_attributeStrideInVectors * SIMD_WIDTH;
}
}
return true;
}
#if ENABLE_AVX512_SIMD16
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
{
SWR_ASSERT(slot < m_numAttributes);
uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
if (0 == numPrimsToAssemble)
{
return false;
}
SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
{
#if USE_SIMD16_FRONTEND
SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif
const float* pBase = pBaseAttrib;
for (uint32_t c = 0; c < 4; ++c)
{
#if USE_SIMD16_FRONTEND
verts[i].v[c] = _simd16_mask_i32gather_ps(
_simd16_setzero_ps(),
pBase,
indices,
_simd16_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
#else
simdscalar temp = _simd_mask_i32gather_ps(
_simd_setzero_ps(),
pBase,
indices,
_simd_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
pBase += m_attributeStrideInVectors * SIMD_WIDTH;
}
}
return true;
}
#endif
void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
SWR_ASSERT(slot < m_numAttributes);
SWR_ASSERT(primIndex < PA_TESS::NumPrims());
const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
{
#if USE_SIMD16_FRONTEND
uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
uint32_t index = m_ppIndices[i][primIndex];
#endif
const float* pVertData = pVertDataBase;
float* pVert = (float*)&verts[i];
for (uint32_t c = 0; c < 4; ++c)
{
pVert[c] = pVertData[index];
pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
}
}
}
bool NextPrim()
{
uint32_t numPrims = PA_TESS::NumPrims();
m_numPrims -= numPrims;
m_ppIndices[0] += numPrims;
m_ppIndices[1] += numPrims;
m_ppIndices[2] += numPrims;
return HasWork();
}
SIMDVERTEX& GetNextVsOutput()
{
SWR_NOT_IMPL;
return junkVertex;
}
bool GetNextStreamOutput()
{
SWR_NOT_IMPL;
return false;
}
SIMDMASK& GetNextVsIndices()
{
SWR_NOT_IMPL;
return junkIndices;
}
uint32_t NumPrims()
{
return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
}
void Reset()
{
SWR_NOT_IMPL;
}
SIMDSCALARI GetPrimID(uint32_t startID)
{
#if USE_SIMD16_FRONTEND
return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
}
private:
const SIMDSCALAR* m_pVertexData = nullptr;
uint32_t m_attributeStrideInVectors = 0;
uint32_t m_numAttributes = 0;
uint32_t m_numPrims = 0;
uint32_t* m_ppIndices[3];
uint32_t m_numVertsPerPrim = 0;
SIMDSCALARI m_vPrimId;
simdvector junkVector; // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API
SIMDMASK junkIndices; // temporary index store for unused virtual function
};
// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
{
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
const API_STATE& state = GetApiState(pDC);
if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
topo == TOP_TRIANGLE_LIST)) ||
// non-indexed draws with adjacency topologies must use cut-aware PA until we add support
// for them in the optimized PA
(topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
{
memset(&indexStore, 0, sizeof(indexStore));
uint32_t numAttribs = state.feNumAttributes;
new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
cutPA = true;
}
else
#endif
{
uint32_t numPrims = GetNumPrims(in_topo, numVerts);
new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
cutPA = false;
}
}
PA_STATE& GetPA()
{
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
if (cutPA)
{
return this->paCut;
}
else
#endif
{
return this->paOpt;
}
}
PA_STATE_OPT paOpt;
PA_STATE_CUT paCut;
bool cutPA{ false };
PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};
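// Hypothetical usage sketch of PA_FACTORY (the template arguments select
// indexed/cut-index handling at compile time; actual instantiation lives in
// the front-end draw pipeline):
//
//     PA_STATE::SIMDVERTEX vertexStore[32];
//     PA_FACTORY<std::true_type, std::true_type> paFactory(
//         pDC, TOP_TRIANGLE_LIST, numVerts, vertexStore,
//         sizeof(vertexStore) / sizeof(vertexStore[0]), vertexStride);
//     PA_STATE& pa = paFactory.GetPA();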