| /**************************************************************************** |
| * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| * @file pa.h |
| * |
| * @brief Definitions for primitive assembly. |
| * N primitives are assembled at a time, where N is the SIMD width. |
| * A state machine, that is specific for a given topology, drives the |
| * assembly of vertices into triangles. |
| * |
| ******************************************************************************/ |
| #pragma once |
| |
| #include "frontend.h" |
| |
| struct PA_STATE |
| { |
| #if USE_SIMD16_FRONTEND |
| enum |
| { |
| SIMD_WIDTH = KNOB_SIMD16_WIDTH, |
| SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2, |
| SIMD_WIDTH_LOG2 = 4 |
| }; |
| |
| typedef simd16mask SIMDMASK; |
| |
| typedef simd16scalar SIMDSCALAR; |
| typedef simd16vector SIMDVECTOR; |
| typedef simd16vertex SIMDVERTEX; |
| |
| typedef simd16scalari SIMDSCALARI; |
| |
| #else |
| enum |
| { |
| SIMD_WIDTH = KNOB_SIMD_WIDTH, |
| SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2, |
| SIMD_WIDTH_LOG2 = 3 |
| }; |
| |
| typedef simdmask SIMDMASK; |
| |
| typedef simdscalar SIMDSCALAR; |
| typedef simdvector SIMDVECTOR; |
| typedef simdvertex SIMDVERTEX; |
| |
| typedef simdscalari SIMDSCALARI; |
| |
| #endif |
| DRAW_CONTEXT *pDC{ nullptr }; // draw context |
| uint8_t* pStreamBase{ nullptr }; // vertex stream |
| uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts |
| uint32_t vertexStride{ 0 }; // stride of a vertex in simdvector units |
| |
| // The topology the binner will use. In some cases the FE changes the topology from the api state. |
| PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN }; |
| |
| #if ENABLE_AVX512_SIMD16 |
| bool useAlternateOffset{ false }; |
| |
| #endif |
| PA_STATE() {} |
| PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) : |
| pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {} |
| |
| virtual bool HasWork() = 0; |
| virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; |
| #if ENABLE_AVX512_SIMD16 |
| virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0; |
| #endif |
| virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; |
| #if ENABLE_AVX512_SIMD16 |
| virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0; |
| #endif |
| virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0; |
| virtual bool NextPrim() = 0; |
| virtual SIMDVERTEX& GetNextVsOutput() = 0; |
| virtual bool GetNextStreamOutput() = 0; |
| virtual SIMDMASK& GetNextVsIndices() = 0; |
| virtual uint32_t NumPrims() = 0; |
| virtual void Reset() = 0; |
| virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0; |
| }; |
| |
| // The Optimized PA is a state machine that assembles triangles from vertex shader simd |
| // output. Here is the sequence |
| // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd). |
| // 2. Execute PA function to assemble and bin triangles. |
| // a. The PA function is a set of functions that collectively make up the |
| // state machine for a given topology. |
| // 1. We use a state index to track which PA function to call. |
| //            b. Often the PA function needs 2 simd vertices in order to assemble the next triangle. |
| // 1. We call this the current and previous simd vertex. |
| //                2. The SSE simd is 4-wide, which is not a multiple of the 3 verts needed for a triangle. |
| //                   To assemble the second triangle of a triangle list, we need the last vertex from the |
| //                   previous simd and the first 2 vertices from the current simd. |
| // 3. At times the PA can assemble multiple triangles from the 2 simd vertices. |
| // |
| // This optimized PA is not cut aware, so it should only be used for non-indexed draws or draws |
| // without cuts. |
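| // |
| // A minimal consumption sketch (illustrative only, not the actual front-end code; 'slot' and |
| // 'drawStartPrimID' below are placeholder names): after the VS fills the next simd vertex, the |
| // loop drains the assembler until NextPrim() reports that no more prims can be built from the |
| // vertices seen so far. |
| // |
| //     PA_STATE_OPT pa(pDC, numPrims, pStream, streamSizeInVerts, vertexStride, false, topo); |
| //     while (pa.HasWork()) |
| //     { |
| //         auto& vsOut = pa.GetNextVsOutput();    // run the vertex shader into vsOut |
| //         bool assembled; |
| //         do |
| //         { |
| //             simdvector prim[MAX_NUM_VERTS_PER_PRIM]; |
| //             assembled = pa.Assemble(slot, prim); |
| //             if (assembled) |
| //             { |
| //                 // bin the SIMD prims; pa.GetPrimID(drawStartPrimID) supplies prim IDs |
| //             } |
| //         } while (pa.NextPrim()); |
| //     } |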
| struct PA_STATE_OPT : public PA_STATE |
| { |
| uint32_t numPrims{ 0 }; // Total number of primitives for draw. |
| uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives. |
| |
| uint32_t numSimdPrims{ 0 }; // Number of prims in current simd. |
| |
| uint32_t cur{ 0 }; // index to current VS output. |
| uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state. |
| const uint32_t first{ 0 }; // index to first VS output. Used for tri fan and line loop. |
| |
| uint32_t counter{ 0 }; // state counter |
| bool reset{ false }; // reset state |
| |
| uint32_t primIDIncr{ 0 }; // how much to advance the prim ID for each simd vector of prims (typically SIMD width or SIMD width / 2) |
| SIMDSCALARI primID; |
| |
| typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); |
| #if ENABLE_AVX512_SIMD16 |
| typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); |
| #endif |
| typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); |
| |
| PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling a full SIMD of triangles. |
| #if ENABLE_AVX512_SIMD16 |
| PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr }; |
| #endif |
| PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle. |
| PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset |
| #if ENABLE_AVX512_SIMD16 |
| PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr }; |
| #endif |
| |
| // state used to advance the PA when Next is called |
| PFN_PA_FUNC pfnPaNextFunc{ nullptr }; |
| #if ENABLE_AVX512_SIMD16 |
| PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr }; |
| #endif |
| uint32_t nextNumSimdPrims{ 0 }; |
| uint32_t nextNumPrimsIncrement{ 0 }; |
| bool nextReset{ false }; |
| bool isStreaming{ false }; |
| |
| SIMDMASK junkIndices { 0 }; // temporary index store for unused virtual function |
| |
| PA_STATE_OPT() {} |
| PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, |
| uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); |
| |
| bool HasWork() |
| { |
| return this->numPrimsComplete < this->numPrims; |
| } |
| |
| simdvector& GetSimdVector(uint32_t index, uint32_t slot) |
| { |
| SWR_ASSERT(slot < vertexStride); |
| uint32_t offset = index * vertexStride + slot; |
| simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset]; |
| return vertexSlot; |
| } |
| |
| #if ENABLE_AVX512_SIMD16 |
| simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) |
| { |
| SWR_ASSERT(slot < vertexStride); |
| uint32_t offset = index * vertexStride + slot; |
| simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset]; |
| return vertexSlot; |
| } |
| |
| #endif |
| // Assembles a full SIMD of triangles. Each simdvector is one vertex of every assembled |
| // triangle, stored SoA (xxxx yyyy zzzz wwww), and there are 3 verts per triangle. |
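| // For example, with an 8-wide simd, after a successful Assemble() of a triangle topology, |
| // verts[0].v[0] holds the x components of vertex 0 of each of the 8 triangles, verts[0].v[1] |
| // the y components, and so on; verts[1] and verts[2] hold vertices 1 and 2 the same way. |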
| bool Assemble(uint32_t slot, simdvector verts[]) |
| { |
| return this->pfnPaFunc(*this, slot, verts); |
| } |
| |
| #if ENABLE_AVX512_SIMD16 |
| bool Assemble_simd16(uint32_t slot, simd16vector verts[]) |
| { |
| return this->pfnPaFunc_simd16(*this, slot, verts); |
| } |
| |
| #endif |
| // Assembles 1 primitive. Each simd4scalar is a vertex (xyzw). |
| void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) |
| { |
| return this->pfnPaSingleFunc(*this, slot, primIndex, verts); |
| } |
| |
| bool NextPrim() |
| { |
| this->pfnPaFunc = this->pfnPaNextFunc; |
| #if ENABLE_AVX512_SIMD16 |
| this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16; |
| #endif |
| this->numSimdPrims = this->nextNumSimdPrims; |
| this->numPrimsComplete += this->nextNumPrimsIncrement; |
| this->reset = this->nextReset; |
| |
| if (this->isStreaming) |
| { |
| this->reset = false; |
| } |
| |
| bool morePrims = false; |
| |
| if (this->numSimdPrims > 0) |
| { |
| morePrims = true; |
| this->numSimdPrims--; |
| } |
| else |
| { |
| this->counter = (this->reset) ? 0 : (this->counter + 1); |
| this->reset = false; |
| } |
| |
| if (!HasWork()) |
| { |
| morePrims = false; // no more to do |
| } |
| |
| return morePrims; |
| } |
| |
| SIMDVERTEX& GetNextVsOutput() |
| { |
| const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH; |
| |
| // increment cur and prev indices |
| if (counter < numSimdVerts) |
| { |
| // prev undefined for first state |
| prev = cur; |
| cur = counter; |
| } |
| else |
| { |
| // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer |
| uint32_t temp = prev; |
| |
| prev = cur; |
| cur = temp; |
| } |
| |
| SWR_ASSERT(cur < numSimdVerts); |
| SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride]; |
| |
| return *(SIMDVERTEX*)pVertex; |
| } |
| |
| SIMDMASK& GetNextVsIndices() |
| { |
| // unused in optimized PA, pass tmp buffer back |
| return junkIndices; |
| } |
| |
| bool GetNextStreamOutput() |
| { |
| this->prev = this->cur; |
| this->cur = this->counter; |
| |
| return HasWork(); |
| } |
| |
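| // Worked example (assuming SIMD_WIDTH == 8): with numPrims == 10, the final batch has |
| // numPrimsComplete + nextNumPrimsIncrement == 16, so NumPrims() returns 8 - (16 - 10) == 2, |
| // i.e. only the first 2 SIMD lanes of that batch contain valid primitives. |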
| uint32_t NumPrims() |
| { |
| return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? |
| (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH; |
| } |
| |
| void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, |
| PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, |
| uint32_t numSimdPrims = 0, |
| uint32_t numPrimsIncrement = 0, |
| bool reset = false) |
| { |
| this->pfnPaNextFunc = pfnPaNextFunc; |
| this->nextNumSimdPrims = numSimdPrims; |
| this->nextNumPrimsIncrement = numPrimsIncrement; |
| this->nextReset = reset; |
| |
| this->pfnPaSingleFunc = pfnPaNextSingleFunc; |
| } |
| |
| #if ENABLE_AVX512_SIMD16 |
| void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, |
| PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, |
| PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, |
| uint32_t numSimdPrims = 0, |
| uint32_t numPrimsIncrement = 0, |
| bool reset = false) |
| { |
| this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16; |
| this->pfnPaNextFunc = pfnPaNextFunc; |
| this->nextNumSimdPrims = numSimdPrims; |
| this->nextNumPrimsIncrement = numPrimsIncrement; |
| this->nextReset = reset; |
| |
| this->pfnPaSingleFunc = pfnPaNextSingleFunc; |
| } |
| |
| #endif |
| void Reset() |
| { |
| #if ENABLE_AVX512_SIMD16 |
| useAlternateOffset = false; |
| |
| #endif |
| this->pfnPaFunc = this->pfnPaFuncReset; |
| #if ENABLE_AVX512_SIMD16 |
| this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16; |
| #endif |
| this->numPrimsComplete = 0; |
| this->numSimdPrims = 0; |
| this->cur = 0; |
| this->prev = 0; |
| this->counter = 0; |
| this->reset = false; |
| } |
| |
| SIMDSCALARI GetPrimID(uint32_t startID) |
| { |
| #if USE_SIMD16_FRONTEND |
| return _simd16_add_epi32(this->primID, |
| _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH))); |
| #else |
| return _simd_add_epi32(this->primID, |
| _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH))); |
| #endif |
| } |
| }; |
| |
| // helper C wrappers to avoid having to rewrite all the PA topology state functions |
| INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, |
| PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, |
| uint32_t numSimdPrims = 0, |
| uint32_t numPrimsIncrement = 0, |
| bool reset = false) |
| { |
| return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); |
| } |
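| |
| // Illustrative sketch of how a per-topology state function typically uses SetNextPaState |
| // (the function names below are hypothetical; the real topology functions live elsewhere): |
| // each state assembles what it can from the current/previous simd verts, then registers the |
| // function that NextPrim() should switch to. |
| // |
| //     bool PaExampleTopo0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])   // hypothetical |
| //     { |
| //         // not enough verts yet to emit a full SIMD of prims; request the next simd vertex |
| //         SetNextPaState(pa, PaExampleTopo1, PaExampleTopoSingle0);              // hypothetical |
| //         return false;   // nothing assembled on this call |
| //     } |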
| |
| #if ENABLE_AVX512_SIMD16 |
| INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, |
| PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, |
| PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, |
| uint32_t numSimdPrims = 0, |
| uint32_t numPrimsIncrement = 0, |
| bool reset = false) |
| { |
| return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); |
| } |
| |
| #endif |
| INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) |
| { |
| return pa.GetSimdVector(index, slot); |
| } |
| |
| #if ENABLE_AVX512_SIMD16 |
| INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot) |
| { |
| return pa.GetSimdVector_simd16(index, slot); |
| } |
| |
| #endif |
| // Cut-aware primitive assembler. |
| struct PA_STATE_CUT : public PA_STATE |
| { |
| SIMDMASK* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex |
| uint32_t numVerts{ 0 }; // number of vertices available in buffer store |
| uint32_t numAttribs{ 0 }; // number of attributes |
| int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled |
| uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw |
| #if ENABLE_AVX512_SIMD16 |
| OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather |
| #else |
| OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather |
| #endif |
| SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd |
| uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled |
| uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store |
| uint32_t tailVertex{ 0 }; // beginning vertex currently assembling |
| uint32_t curVertex{ 0 }; // current unprocessed vertex |
| uint32_t startPrimId{ 0 }; // starting prim id |
| SIMDSCALARI vPrimId; // vector of prim ID |
| bool needOffsets{ false }; // need to compute gather offsets for current SIMD |
| uint32_t vertsPerPrim{ 0 }; |
| bool processCutVerts{ false }; // if true, vertex indices at cuts are processed as normal; otherwise they |
| // are ignored. The fetch shader sends invalid verts on cuts that should be ignored, |
| // while the GS sends valid verts for every index. |
| |
| simdvector junkVector; // junk simdvector for unimplemented API |
| #if ENABLE_AVX512_SIMD16 |
| simd16vector junkVector_simd16; // junk simd16vector for unimplemented API |
| #endif |
| |
| // Topology state tracking |
| uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; |
| uint32_t curIndex{ 0 }; |
| bool reverseWinding{ false }; // indicates reverse winding for strips |
| int32_t adjExtraVert{ 0 }; // extra vert used for tristrip w/ adj |
| |
| typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); |
| PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert |
| |
| PA_STATE_CUT() {} |
| PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts, |
| uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts) |
| : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride) |
| { |
| numVerts = in_streamSizeInVerts; |
| numAttribs = in_numAttribs; |
| binTopology = topo; |
| needOffsets = false; |
| processCutVerts = in_processCutVerts; |
| |
| numVertsToAssemble = numRemainingVerts = in_numVerts; |
| numPrimsAssembled = 0; |
| headVertex = tailVertex = curVertex = 0; |
| |
| curIndex = 0; |
| pCutIndices = in_pIndices; |
| memset(indices, 0, sizeof(indices)); |
| #if USE_SIMD16_FRONTEND |
| vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
| #else |
| vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); |
| #endif |
| reverseWinding = false; |
| adjExtraVert = -1; |
| |
| bool gsEnabled = pDC->pState->state.gsState.gsEnable; |
| vertsPerPrim = NumVertsPerPrim(topo, gsEnabled); |
| |
| switch (topo) |
| { |
| case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break; |
| case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break; |
| case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break; |
| case TOP_TRI_STRIP_ADJ: if (gsEnabled) |
| { |
| pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>; |
| } |
| else |
| { |
| pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>; |
| } |
| break; |
| |
| case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break; |
| case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break; |
| case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break; |
| case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break; |
| case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break; |
| default: assert(0 && "Unimplemented topology"); |
| } |
| } |
| |
| SIMDVERTEX& GetNextVsOutput() |
| { |
| uint32_t vertexIndex = this->headVertex / SIMD_WIDTH; |
| this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts; |
| this->needOffsets = true; |
| SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride]; |
| |
| return *(SIMDVERTEX*)pVertex; |
| } |
| |
| SIMDMASK& GetNextVsIndices() |
| { |
| uint32_t vertexIndex = this->headVertex / SIMD_WIDTH; |
| SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex; |
| return *pCurCutIndex; |
| } |
| |
| simdvector& GetSimdVector(uint32_t index, uint32_t slot) |
| { |
| // unused |
| SWR_ASSERT(0 && "Not implemented"); |
| return junkVector; |
| } |
| |
| #if ENABLE_AVX512_SIMD16 |
| simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) |
| { |
| // unused |
| SWR_ASSERT(0 && "Not implemented"); |
| return junkVector_simd16; |
| } |
| |
| #endif |
| bool GetNextStreamOutput() |
| { |
| this->headVertex += SIMD_WIDTH; |
| this->needOffsets = true; |
| return HasWork(); |
| } |
| |
| SIMDSCALARI GetPrimID(uint32_t startID) |
| { |
| #if USE_SIMD16_FRONTEND |
| return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId); |
| #else |
| return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); |
| #endif |
| } |
| |
| void Reset() |
| { |
| #if ENABLE_AVX512_SIMD16 |
| useAlternateOffset = false; |
| |
| #endif |
| this->numRemainingVerts = this->numVertsToAssemble; |
| this->numPrimsAssembled = 0; |
| this->curIndex = 0; |
| this->curVertex = 0; |
| this->tailVertex = 0; |
| this->headVertex = 0; |
| this->reverseWinding = false; |
| this->adjExtraVert = -1; |
| #if USE_SIMD16_FRONTEND |
| this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
| #else |
| this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); |
| #endif |
| } |
| |
| bool HasWork() |
| { |
| return this->numRemainingVerts > 0 || this->adjExtraVert != -1; |
| } |
| |
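| // Example (assuming SIMD_WIDTH == 8 and numVerts == 32): with tailVertex == 0 the store is |
| // full once headVertex reaches 24, since (24 + 8) % 32 == 0 and writing another simd vertex |
| // would overwrite data not yet consumed. |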
| bool IsVertexStoreFull() |
| { |
| return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex; |
| } |
| |
| void RestartTopology() |
| { |
| this->curIndex = 0; |
| this->reverseWinding = false; |
| this->adjExtraVert = -1; |
| } |
| |
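| // Example (assuming SIMD_WIDTH == 16): vertex 19 maps to cut mask pCutIndices[19 / 16] == |
| // pCutIndices[1] and bit position 19 & 15 == 3, so the vertex is a cut if bit 3 of that |
| // mask is set. |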
| bool IsCutIndex(uint32_t vertex) |
| { |
| uint32_t vertexIndex = vertex / SIMD_WIDTH; |
| uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1); |
| return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1; |
| } |
| |
| // iterates across the unprocessed verts until we hit the end or we |
| // have assembled SIMD prims |
| void ProcessVerts() |
| { |
| while (this->numPrimsAssembled != SIMD_WIDTH && |
| this->numRemainingVerts > 0 && |
| this->curVertex != this->headVertex) |
| { |
| // if cut index, restart topology |
| if (IsCutIndex(this->curVertex)) |
| { |
| if (this->processCutVerts) |
| { |
| (this->*pfnPa)(this->curVertex, false); |
| } |
| // finish off tri strip w/ adj before restarting topo |
| if (this->adjExtraVert != -1) |
| { |
| (this->*pfnPa)(this->curVertex, true); |
| } |
| RestartTopology(); |
| } |
| else |
| { |
| (this->*pfnPa)(this->curVertex, false); |
| } |
| |
| this->curVertex++; |
| if (this->curVertex >= this->numVerts) { |
| this->curVertex = 0; |
| } |
| this->numRemainingVerts--; |
| } |
| |
| // special case last primitive for tri strip w/ adj |
| if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1) |
| { |
| (this->*pfnPa)(this->curVertex, true); |
| } |
| } |
| |
| void Advance() |
| { |
| // done with current batch |
| // advance tail to the current unsubmitted vertex |
| this->tailVertex = this->curVertex; |
| this->numPrimsAssembled = 0; |
| #if USE_SIMD16_FRONTEND |
| this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH)); |
| #else |
| this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH)); |
| #endif |
| } |
| |
| bool NextPrim() |
| { |
| // if we've assembled enough prims, we can advance to the next set of verts |
| if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0) |
| { |
| Advance(); |
| } |
| return false; |
| } |
| |
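| // Worked example (assuming SIMD_WIDTH == 8, so simdShift == 3): for index 19 the simdvertex |
| // batch is 19 >> 3 == 2, contributing a byte offset of 2 * vertexStride * sizeof(SIMDVECTOR), |
| // and the lane within the batch is 19 & 7 == 3, adding 3 * sizeof(float) to reach that |
| // vertex's x lane. |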
| void ComputeOffsets() |
| { |
| for (uint32_t v = 0; v < this->vertsPerPrim; ++v) |
| { |
| uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR); |
| SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0]; |
| |
| // step to simdvertex batch |
| const uint32_t simdShift = SIMD_WIDTH_LOG2; |
| #if USE_SIMD16_FRONTEND |
| SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift); |
| this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes)); |
| #else |
| SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift); |
| this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes)); |
| #endif |
| |
| // step to index |
| const uint32_t simdMask = SIMD_WIDTH - 1; |
| #if USE_SIMD16_FRONTEND |
| SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask)); |
| this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float)))); |
| #else |
| SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); |
| this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); |
| #endif |
| } |
| } |
| |
| bool Assemble(uint32_t slot, simdvector *verts) |
| { |
| // process any outstanding verts |
| ProcessVerts(); |
| |
| // return false if we don't have enough prims assembled |
| if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0) |
| { |
| return false; |
| } |
| |
| // cache off gather offsets given the current SIMD set of indices the first time we get an assemble |
| if (this->needOffsets) |
| { |
| ComputeOffsets(); |
| this->needOffsets = false; |
| } |
| |
| for (uint32_t v = 0; v < this->vertsPerPrim; ++v) |
| { |
| SIMDSCALARI offsets = this->vOffsets[v]; |
| |
| // step to attribute |
| #if USE_SIMD16_FRONTEND |
| offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR))); |
| #else |
| offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR))); |
| #endif |
| |
| float* pBase = (float*)this->pStreamBase; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| #if USE_SIMD16_FRONTEND |
| simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1); |
| |
| verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); |
| #else |
| verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); |
| #endif |
| |
| // move base to next component |
| pBase += SIMD_WIDTH; |
| } |
| } |
| |
| return true; |
| } |
| |
| #if ENABLE_AVX512_SIMD16 |
| bool Assemble_simd16(uint32_t slot, simd16vector verts[]) |
| { |
| // process any outstanding verts |
| ProcessVerts(); |
| |
| // return false if we don't have enough prims assembled |
| if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0) |
| { |
| return false; |
| } |
| |
| // cache off gather offsets given the current SIMD set of indices the first time we get an assemble |
| if (this->needOffsets) |
| { |
| ComputeOffsets(); |
| this->needOffsets = false; |
| } |
| |
| for (uint32_t v = 0; v < this->vertsPerPrim; ++v) |
| { |
| SIMDSCALARI offsets = this->vOffsets[v]; |
| |
| // step to attribute |
| #if USE_SIMD16_FRONTEND |
| offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR))); |
| #else |
| offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector))); |
| #endif |
| |
| float* pBase = (float*)this->pStreamBase; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| #if USE_SIMD16_FRONTEND |
| verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1); |
| #else |
| verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0); |
| #endif |
| |
| // move base to next component |
| pBase += SIMD_WIDTH; |
| } |
| } |
| |
| return true; |
| } |
| |
| #endif |
| void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3]) |
| { |
| // move to slot |
| for (uint32_t v = 0; v < this->vertsPerPrim; ++v) |
| { |
| uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; |
| #if USE_SIMD16_FRONTEND |
| uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex]; |
| #else |
| uint32_t offset = pOffset[triIndex]; |
| #endif |
| offset += sizeof(SIMDVECTOR) * slot; |
| float* pVert = (float*)&tri[v]; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| float* pComponent = (float*)(this->pStreamBase + offset); |
| pVert[c] = *pComponent; |
| offset += SIMD_WIDTH * sizeof(float); |
| } |
| } |
| } |
| |
| uint32_t NumPrims() |
| { |
| return this->numPrimsAssembled; |
| } |
| |
| // Per-topology functions |
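| // Example: for strip vertices v0 v1 v2 v3, ProcessVertTriStrip gathers the first prim as |
| // (v0, v1, v2); for the second prim reverseWinding is set, so it is gathered as (v1, v3, v2), |
| // preserving the strip's front-facing orientation. |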
| void ProcessVertTriStrip(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 3) |
| { |
| // assembled enough verts for prim, add to gather indices |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| if (reverseWinding) |
| { |
| this->indices[1][this->numPrimsAssembled] = this->vert[2]; |
| this->indices[2][this->numPrimsAssembled] = this->vert[1]; |
| } |
| else |
| { |
| this->indices[1][this->numPrimsAssembled] = this->vert[1]; |
| this->indices[2][this->numPrimsAssembled] = this->vert[2]; |
| } |
| |
| // increment numPrimsAssembled |
| this->numPrimsAssembled++; |
| |
| // set up next prim state |
| this->vert[0] = this->vert[1]; |
| this->vert[1] = this->vert[2]; |
| this->curIndex = 2; |
| this->reverseWinding ^= 1; |
| } |
| } |
| |
| template<bool gsEnabled> |
| void AssembleTriStripAdj() |
| { |
| if (!gsEnabled) |
| { |
| this->vert[1] = this->vert[2]; |
| this->vert[2] = this->vert[4]; |
| |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[1]; |
| this->indices[2][this->numPrimsAssembled] = this->vert[2]; |
| |
| this->vert[4] = this->vert[2]; |
| this->vert[2] = this->vert[1]; |
| } |
| else |
| { |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[1]; |
| this->indices[2][this->numPrimsAssembled] = this->vert[2]; |
| this->indices[3][this->numPrimsAssembled] = this->vert[3]; |
| this->indices[4][this->numPrimsAssembled] = this->vert[4]; |
| this->indices[5][this->numPrimsAssembled] = this->vert[5]; |
| } |
| this->numPrimsAssembled++; |
| } |
| |
| |
| template<bool gsEnabled> |
| void ProcessVertTriStripAdj(uint32_t index, bool finish) |
| { |
| // handle last primitive of tristrip |
| if (finish && this->adjExtraVert != -1) |
| { |
| this->vert[3] = this->adjExtraVert; |
| AssembleTriStripAdj<gsEnabled>(); |
| this->adjExtraVert = -1; |
| return; |
| } |
| |
| switch (this->curIndex) |
| { |
| case 0: |
| case 1: |
| case 2: |
| case 4: |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| break; |
| case 3: |
| this->vert[5] = index; |
| this->curIndex++; |
| break; |
| case 5: |
| if (this->adjExtraVert == -1) |
| { |
| this->adjExtraVert = index; |
| } |
| else |
| { |
| this->vert[3] = index; |
| if (!gsEnabled) |
| { |
| AssembleTriStripAdj<gsEnabled>(); |
| |
| uint32_t nextTri[6]; |
| if (this->reverseWinding) |
| { |
| nextTri[0] = this->vert[4]; |
| nextTri[1] = this->vert[0]; |
| nextTri[2] = this->vert[2]; |
| nextTri[4] = this->vert[3]; |
| nextTri[5] = this->adjExtraVert; |
| } |
| else |
| { |
| nextTri[0] = this->vert[2]; |
| nextTri[1] = this->adjExtraVert; |
| nextTri[2] = this->vert[3]; |
| nextTri[4] = this->vert[4]; |
| nextTri[5] = this->vert[0]; |
| } |
| for (uint32_t i = 0; i < 6; ++i) |
| { |
| this->vert[i] = nextTri[i]; |
| } |
| |
| this->adjExtraVert = -1; |
| this->reverseWinding ^= 1; |
| } |
| else |
| { |
| this->curIndex++; |
| } |
| } |
| break; |
| case 6: |
| SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!"); |
| AssembleTriStripAdj<gsEnabled>(); |
| |
| uint32_t nextTri[6]; |
| if (this->reverseWinding) |
| { |
| nextTri[0] = this->vert[4]; |
| nextTri[1] = this->vert[0]; |
| nextTri[2] = this->vert[2]; |
| nextTri[4] = this->vert[3]; |
| nextTri[5] = this->adjExtraVert; |
| } |
| else |
| { |
| nextTri[0] = this->vert[2]; |
| nextTri[1] = this->adjExtraVert; |
| nextTri[2] = this->vert[3]; |
| nextTri[4] = this->vert[4]; |
| nextTri[5] = this->vert[0]; |
| } |
| for (uint32_t i = 0; i < 6; ++i) |
| { |
| this->vert[i] = nextTri[i]; |
| } |
| this->reverseWinding ^= 1; |
| this->adjExtraVert = index; |
| this->curIndex--; |
| break; |
| } |
| } |
| |
| void ProcessVertTriList(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 3) |
| { |
| // assembled enough verts for prim, add to gather indices |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[1]; |
| this->indices[2][this->numPrimsAssembled] = this->vert[2]; |
| |
| // increment numPrimsAssembled |
| this->numPrimsAssembled++; |
| |
| // set up next prim state |
| this->curIndex = 0; |
| } |
| } |
| |
| void ProcessVertTriListAdj(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 6) |
| { |
| // assembled enough verts for prim, add to gather indices |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[1]; |
| this->indices[2][this->numPrimsAssembled] = this->vert[2]; |
| this->indices[3][this->numPrimsAssembled] = this->vert[3]; |
| this->indices[4][this->numPrimsAssembled] = this->vert[4]; |
| this->indices[5][this->numPrimsAssembled] = this->vert[5]; |
| |
| // increment numPrimsAssembled |
| this->numPrimsAssembled++; |
| |
| // set up next prim state |
| this->curIndex = 0; |
| } |
| } |
| |
| void ProcessVertTriListAdjNoGs(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 6) |
| { |
| // assembled enough verts for prim, add to gather indices |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[2]; |
| this->indices[2][this->numPrimsAssembled] = this->vert[4]; |
| |
| // increment numPrimsAssembled |
| this->numPrimsAssembled++; |
| |
| // set up next prim state |
| this->curIndex = 0; |
| } |
| } |
| |
| |
| void ProcessVertLineList(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 2) |
| { |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[1]; |
| |
| this->numPrimsAssembled++; |
| this->curIndex = 0; |
| } |
| } |
| |
| void ProcessVertLineStrip(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 2) |
| { |
| // assembled enough verts for prim, add to gather indices |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[1]; |
| |
| // increment numPrimsAssembled |
| this->numPrimsAssembled++; |
| |
| // set up next prim state |
| this->vert[0] = this->vert[1]; |
| this->curIndex = 1; |
| } |
| } |
| |
| void ProcessVertLineStripAdj(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 4) |
| { |
| // assembled enough verts for prim, add to gather indices |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[1]; |
| this->indices[2][this->numPrimsAssembled] = this->vert[2]; |
| this->indices[3][this->numPrimsAssembled] = this->vert[3]; |
| |
| // increment numPrimsAssembled |
| this->numPrimsAssembled++; |
| |
| // set up next prim state |
| this->vert[0] = this->vert[1]; |
| this->vert[1] = this->vert[2]; |
| this->vert[2] = this->vert[3]; |
| this->curIndex = 3; |
| } |
| } |
| |
| void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 4) |
| { |
| // assembled enough verts for prim, add to gather indices |
| this->indices[0][this->numPrimsAssembled] = this->vert[1]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[2]; |
| |
| // increment numPrimsAssembled |
| this->numPrimsAssembled++; |
| |
| // set up next prim state |
| this->vert[0] = this->vert[1]; |
| this->vert[1] = this->vert[2]; |
| this->vert[2] = this->vert[3]; |
| this->curIndex = 3; |
| } |
| } |
| |
| void ProcessVertLineListAdj(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 4) |
| { |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[1]; |
| this->indices[2][this->numPrimsAssembled] = this->vert[2]; |
| this->indices[3][this->numPrimsAssembled] = this->vert[3]; |
| |
| this->numPrimsAssembled++; |
| this->curIndex = 0; |
| } |
| } |
| |
| void ProcessVertLineListAdjNoGs(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 4) |
| { |
| this->indices[0][this->numPrimsAssembled] = this->vert[1]; |
| this->indices[1][this->numPrimsAssembled] = this->vert[2]; |
| |
| this->numPrimsAssembled++; |
| this->curIndex = 0; |
| } |
| } |
| |
| void ProcessVertPointList(uint32_t index, bool finish) |
| { |
| this->vert[this->curIndex] = index; |
| this->curIndex++; |
| if (this->curIndex == 1) |
| { |
| this->indices[0][this->numPrimsAssembled] = this->vert[0]; |
| this->numPrimsAssembled++; |
| this->curIndex = 0; |
| } |
| } |
| }; |
| |
| // Primitive Assembly for data output from the DomainShader. |
| struct PA_TESS : PA_STATE |
| { |
| PA_TESS( |
| DRAW_CONTEXT *in_pDC, |
| const SIMDSCALAR* in_pVertData, |
| uint32_t in_attributeStrideInVectors, |
| uint32_t in_vertexStride, |
| uint32_t in_numAttributes, |
| uint32_t* (&in_ppIndices)[3], |
| uint32_t in_numPrims, |
| PRIMITIVE_TOPOLOGY in_binTopology) : |
| |
| PA_STATE(in_pDC, nullptr, 0, in_vertexStride), |
| m_pVertexData(in_pVertData), |
| m_attributeStrideInVectors(in_attributeStrideInVectors), |
| m_numAttributes(in_numAttributes), |
| m_numPrims(in_numPrims) |
| { |
| #if USE_SIMD16_FRONTEND |
| m_vPrimId = _simd16_setzero_si(); |
| #else |
| m_vPrimId = _simd_setzero_si(); |
| #endif |
| binTopology = in_binTopology; |
| m_ppIndices[0] = in_ppIndices[0]; |
| m_ppIndices[1] = in_ppIndices[1]; |
| m_ppIndices[2] = in_ppIndices[2]; |
| |
| switch (binTopology) |
| { |
| case TOP_POINT_LIST: |
| m_numVertsPerPrim = 1; |
| break; |
| |
| case TOP_LINE_LIST: |
| m_numVertsPerPrim = 2; |
| break; |
| |
| case TOP_TRIANGLE_LIST: |
| m_numVertsPerPrim = 3; |
| break; |
| |
| default: |
| SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__); |
| break; |
| } |
| } |
| |
| bool HasWork() |
| { |
| return m_numPrims != 0; |
| } |
| |
| simdvector& GetSimdVector(uint32_t index, uint32_t slot) |
| { |
| SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__); |
| return junkVector; |
| } |
| |
| #if ENABLE_AVX512_SIMD16 |
| simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) |
| { |
| SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__); |
| return junkVector_simd16; |
| } |
| |
| #endif |
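| // Builds a lane mask with the low numPrims lanes enabled. Example (assuming SIMD_WIDTH == 8): |
| // GenPrimMask(3) loads 8 lanes starting at maskGen[8 - 3] == maskGen[5], yielding |
| // { -1, -1, -1, 0, 0, 0, 0, 0 }, i.e. only the first 3 lanes participate in the masked gathers. |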
| static SIMDSCALARI GenPrimMask(uint32_t numPrims) |
| { |
| SWR_ASSERT(numPrims <= SIMD_WIDTH); |
| #if USE_SIMD16_FRONTEND |
| static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = |
| { |
| -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| }; |
| |
| return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]); |
| #else |
| static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = |
| { |
| -1, -1, -1, -1, -1, -1, -1, -1, |
| 0, 0, 0, 0, 0, 0, 0, 0 |
| }; |
| |
| return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]); |
| #endif |
| } |
| |
| bool Assemble(uint32_t slot, simdvector verts[]) |
| { |
| SWR_ASSERT(slot < m_numAttributes); |
| |
| uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); |
| if (0 == numPrimsToAssemble) |
| { |
| return false; |
| } |
| |
| SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble); |
| |
| const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; |
| for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) |
| { |
| #if USE_SIMD16_FRONTEND |
| SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]); |
| #else |
| SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]); |
| #endif |
| |
| const float* pBase = pBaseAttrib; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| #if USE_SIMD16_FRONTEND |
| simd16scalar temp = _simd16_mask_i32gather_ps( |
| _simd16_setzero_ps(), |
| pBase, |
| indices, |
| _simd16_castsi_ps(mask), |
| 4 /* gcc doesn't like sizeof(float) */); |
| |
| verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); |
| #else |
| verts[i].v[c] = _simd_mask_i32gather_ps( |
| _simd_setzero_ps(), |
| pBase, |
| indices, |
| _simd_castsi_ps(mask), |
| 4); // gcc doesn't like sizeof(float) |
| #endif |
| pBase += m_attributeStrideInVectors * SIMD_WIDTH; |
| } |
| } |
| |
| return true; |
| } |
| |
| #if ENABLE_AVX512_SIMD16 |
| bool Assemble_simd16(uint32_t slot, simd16vector verts[]) |
| { |
| SWR_ASSERT(slot < m_numAttributes); |
| |
| uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); |
| if (0 == numPrimsToAssemble) |
| { |
| return false; |
| } |
| |
| SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble); |
| |
| const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; |
| for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) |
| { |
| #if USE_SIMD16_FRONTEND |
| SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]); |
| #else |
| SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]); |
| #endif |
| |
| const float* pBase = pBaseAttrib; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| #if USE_SIMD16_FRONTEND |
| verts[i].v[c] = _simd16_mask_i32gather_ps( |
| _simd16_setzero_ps(), |
| pBase, |
| indices, |
| _simd16_castsi_ps(mask), |
| 4 /* gcc doesn't like sizeof(float) */); |
| #else |
| simdscalar temp = _simd_mask_i32gather_ps( |
| _simd_setzero_ps(), |
| pBase, |
| indices, |
| _simd_castsi_ps(mask), |
| 4 /* gcc doesn't like sizeof(float) */); |
| verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); |
| #endif |
| pBase += m_attributeStrideInVectors * SIMD_WIDTH; |
| } |
| } |
| |
| return true; |
| } |
| |
| #endif |
| void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) |
| { |
| SWR_ASSERT(slot < m_numAttributes); |
| SWR_ASSERT(primIndex < PA_TESS::NumPrims()); |
| |
| const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; |
| for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) |
| { |
| #if USE_SIMD16_FRONTEND |
| uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex]; |
| #else |
| uint32_t index = m_ppIndices[i][primIndex]; |
| #endif |
| const float* pVertData = pVertDataBase; |
| float* pVert = (float*)&verts[i]; |
| |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| pVert[c] = pVertData[index]; |
| pVertData += m_attributeStrideInVectors * SIMD_WIDTH; |
| } |
| } |
| } |
| |
| bool NextPrim() |
| { |
| uint32_t numPrims = PA_TESS::NumPrims(); |
| m_numPrims -= numPrims; |
| m_ppIndices[0] += numPrims; |
| m_ppIndices[1] += numPrims; |
| m_ppIndices[2] += numPrims; |
| |
| return HasWork(); |
| } |
| |
| SIMDVERTEX& GetNextVsOutput() |
| { |
| SWR_NOT_IMPL; |
| return junkVertex; |
| } |
| |
| bool GetNextStreamOutput() |
| { |
| SWR_NOT_IMPL; |
| return false; |
| } |
| |
| SIMDMASK& GetNextVsIndices() |
| { |
| SWR_NOT_IMPL; |
| return junkIndices; |
| } |
| |
| uint32_t NumPrims() |
| { |
| return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); |
| } |
| |
| void Reset() |
| { |
| SWR_NOT_IMPL; |
| } |
| |
| SIMDSCALARI GetPrimID(uint32_t startID) |
| { |
| #if USE_SIMD16_FRONTEND |
| return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId); |
| #else |
| return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); |
| #endif |
| } |
| |
| private: |
| const SIMDSCALAR* m_pVertexData = nullptr; |
| uint32_t m_attributeStrideInVectors = 0; |
| uint32_t m_numAttributes = 0; |
| uint32_t m_numPrims = 0; |
| uint32_t* m_ppIndices[3]; |
| |
| uint32_t m_numVertsPerPrim = 0; |
| |
| SIMDSCALARI m_vPrimId; |
| |
| simdvector junkVector; // junk simdvector for unimplemented API |
| #if ENABLE_AVX512_SIMD16 |
| simd16vector junkVector_simd16; // junk simd16vector for unimplemented API |
| #endif |
| SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API |
| SIMDMASK junkIndices; // temporary index store for unused virtual function |
| }; |
| |
| // Primitive Assembler factory class, responsible for creating and initializing the correct assembler |
| // based on state. |
| template <typename IsIndexedT, typename IsCutIndexEnabledT> |
| struct PA_FACTORY |
| { |
| PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo) |
| { |
| #if KNOB_ENABLE_CUT_AWARE_PA == TRUE |
| const API_STATE& state = GetApiState(pDC); |
| if ((IsIndexedT::value && IsCutIndexEnabledT::value && ( |
| topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || |
| topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP || |
| topo == TOP_TRIANGLE_LIST)) || |
| |
| // non-indexed draws with adjacency topologies must use cut-aware PA until we add support |
| // for them in the optimized PA |
| (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ)) |
| { |
| memset(&indexStore, 0, sizeof(indexStore)); |
| uint32_t numAttribs = state.feNumAttributes; |
| |
| new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, |
| vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false); |
| cutPA = true; |
| } |
| else |
| #endif |
| { |
| uint32_t numPrims = GetNumPrims(in_topo, numVerts); |
| new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false); |
| cutPA = false; |
| } |
| |
| } |
| |
| PA_STATE& GetPA() |
| { |
| #if KNOB_ENABLE_CUT_AWARE_PA == TRUE |
| if (cutPA) |
| { |
| return this->paCut; |
| } |
| else |
| #endif |
| { |
| return this->paOpt; |
| } |
| } |
| |
| PA_STATE_OPT paOpt; |
| PA_STATE_CUT paCut; |
| |
| bool cutPA{ false }; |
| |
| PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN }; |
| |
| PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM]; |
| }; |
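| |
| // Minimal usage sketch for PA_FACTORY (illustrative only; the template arguments and values |
| // passed below are placeholders for the caller's traits and draw state): the front end |
| // instantiates the factory once per draw, then drives whichever assembler was selected |
| // through the common PA_STATE interface. |
| // |
| //     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, topo, numVerts, pVertexStore, |
| //                                                          vertexStoreSize, vertexStride); |
| //     PA_STATE& pa = paFactory.GetPA(); |
| //     while (pa.HasWork()) |
| //     { |
| //         // generate vertices via pa.GetNextVsOutput(), then pa.Assemble() / pa.NextPrim() |
| //     } |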