// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
//
#include "raster_builder_impl.h"
#include <memory.h>
#include <stdlib.h>
#include "block_pool.h"
#include "common/macros.h"
#include "common/vk/assert.h"
#include "common/vk/barrier.h"
#include "flush.h"
#include "handle_pool.h"
#include "path_builder.h"
#include "path_builder_impl.h"
#include "queue_pool.h"
#include "ring.h"
#include "shaders/push.h"
#include "spinel/spinel_assert.h"
#include "weakref.h"
//
// The raster builder prepares fill commands, transforms and clips for the
// rasterization sub-pipeline.
//
// A simplifying assumption is that the maximum length of a single raster can't
// be larger than what fits in the raster builder ring.
//
// This would be a very long raster and is a legitimate size limitation.
//
// If a raster exceeds this limit then the raster builder instance is lost.
//
// Note that this restriction can be removed with added complexity to the
// builder and shaders.
//
// The general strategy that this particular Vulkan implementation uses is to
// allocate a large "HOST_COHERENT" buffer for the ring.
//
// Note that the maximum number of "in-flight" rasterization sub-pipelines is
// conveniently determined by the size of the fence pool.
//
// The size of the ring buffer is driven by the desired size limit of a single
// raster.
//
// The worst-case total storage per fill() invocation is:
//
//   coherent
//   - fills      :  4 dwords
//   - transforms :  8 dwords
//   - clips      :  4 dwords
//   host
//   - paths      :  1 dword
//   - rasters    :  1 dword  +
//                  ----------
//                   18 dwords
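//
// For illustration only (hypothetical ring size): a ring of 8192 fill
// commands implies, in this worst case, 8192 x 16 dwords = 512 KiB of
// coherent storage and 8192 x 2 dwords = 64 KiB of host storage.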
//
// There are at most (SPN_RASTER_COHORT_METAS_SIZE-1) rasters in a single
// cohort.
//
// A single raster will necessarily have a maximum number of
// paths/transforms/clips.
//
// Exceeding this limit terminates the raster builder.
//
// Note that the fills/paths count will always be 1:1 and potentially greater
// than the varying transforms/clips/rasters counts.
//
// Worst case is that the fills/transforms/clips/paths/rasters counts are all
// equal.
//
// Note that fill commands, transforms and clips may be read more than once by
// the rasterization sub-pipeline.
//
// Depending on the device architecture, it may be beneficial to copy the
// working region of the coherent buffer to a device-local buffer.
//
// If the Vulkan device is integrated or supports mapped write-through (AMD)
// then we don't need to copy. If the device is discrete and doesn't support
// write-through (NVIDIA) then we do.
//
// Note that the fill command can reduce its transform and clip fields to 13-16
// bits and fit into 3 dwords but it's easier to use a uint4 with GPUs.
//
// A non-affine transformation elevates a Bezier to a rational. For this
// reason, we indicate with a bit flag if the transform matrix has non-zero
// {w0,w1} elements.
//
//
//
// clang-format off
#define SPN_RASTER_BUILDER_RAST_TYPE_EXPAND() \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(proj_line, SPN_RAST_TYPE_PROJ_LINE, 4) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(proj_quad, SPN_RAST_TYPE_PROJ_QUAD, 6) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(proj_cubic, SPN_RAST_TYPE_PROJ_CUBIC, 8) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(line, SPN_RAST_TYPE_LINE, 4) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(quad, SPN_RAST_TYPE_QUAD, 6) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(cubic, SPN_RAST_TYPE_CUBIC, 8) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(rat_quad, SPN_RAST_TYPE_RAT_QUAD, 7) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(rat_cubic, SPN_RAST_TYPE_RAT_CUBIC, 10)
//
// TODO(allanmac): Unify the .cf and .tc rings since they're both quads.
//
//
// The fill command layout is the same on both the host and device.
//
struct spinel_cmd_fill
{
uint32_t path_h; // host id
uint32_t na0 : 16; // unused
uint32_t cohort : 15; // cohort is 8-11 bits
uint32_t transform_type : 1; // transform type: 0=affine,1=projective
uint32_t transform; // index of first quad of transform
uint32_t clip; // index of clip quad
};
// clang-format on
STATIC_ASSERT_MACRO_1(sizeof(struct spinel_cmd_fill) == sizeof(uint32_t[4]));
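//
// Dword layout sketch (assuming low-to-high bitfield allocation):
//
//   dword 0 : path_h
//   dword 1 : na0[15:0] | cohort[30:16] | transform_type[31]
//   dword 2 : transform
//   dword 3 : clip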
//
// Ring work span
//
struct spinel_rbi_head_span
{
uint32_t head;
uint32_t span;
};
//
// Dispatch states
//
typedef enum spinel_rbi_dispatch_state_e
{
SPN_RBI_DISPATCH_STATE_INVALID,
SPN_RBI_DISPATCH_STATE_RECORDING,
SPN_RBI_DISPATCH_STATE_PENDING,
SPN_RBI_DISPATCH_STATE_COMPLETE,
} spinel_rbi_dispatch_state_e;
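//
// State lifecycle, as driven by the code below:
//
//   INVALID   --(dispatch_head_init)----> RECORDING
//   RECORDING --(flush_submit)----------> PENDING
//   PENDING   --(flush_complete)--------> COMPLETE
//   COMPLETE  --(in-order ring release)-> INVALID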
//
// There are always as many dispatch records as there are fences in
// the fence pool. This simplifies reasoning about concurrency.
//
// FIXME(allanmac): We don't have to track tc/rc once submitted
// FIXME(allanmac): We probably can drop most of the dbi structs
//
struct spinel_rbi_dispatch
{
struct
{
struct
{
struct spinel_dbi_devaddr internal;
struct spinel_dbi_devaddr indirect;
} rs;
struct spinel_dbi_devaddr ttrks; // ttrks + ttrks_keyvals_even
struct spinel_dbi_devaddr fill_scan; // used before sorting
struct spinel_dbi_devaddr rast_cmds; // used before sorting
struct spinel_dbi_devaddr ttrk_keyvals_odd; // used by radix and post-sort
} vk;
struct spinel_rbi_head_span cf; // fills and paths are 1:1
struct spinel_rbi_head_span tc; // transform quads and clips
struct spinel_rbi_head_span rc; // rasters in cohort
spinel_deps_delayed_semaphore_t delayed;
spinel_rbi_dispatch_state_e state;
};
//
// The host-side rings share a single host-coherent buffer:
//
// |<--cmds(uvec4)-->|<--transform.lo/hi & clip(vec4)-->|<--raster_h(uint)-->|
//
// Each ring has a different access pattern:
//
//   ring       | reads
//   -----------+-------
//   cmd_fills  |  2
//   transforms |  1+
//   clips      |  1+
//   rasters    |  1
//
// For this reason, some Vulkan devices may benefit from copying the
// ring spans from the host-coherent buffer to a device-local buffer.
//
struct spinel_rbi_vk
{
struct
{
struct
{
struct spinel_dbi_dm_devaddr h;
struct spinel_dbi_dm_devaddr d;
} cf;
struct
{
struct spinel_dbi_dm_devaddr h;
struct spinel_dbi_dm_devaddr d;
} tc;
struct
{
struct spinel_dbi_dm_devaddr h;
struct spinel_dbi_dm_devaddr d;
} rc;
} rings;
struct
{
struct spinel_dbi_dm ttrks;
struct spinel_dbi_dm rfs_rrc_tko;
struct
{
struct spinel_dbi_dm internal;
struct spinel_dbi_dm indirect;
} rs;
} dispatch;
};
//
//
//
struct spinel_raster_builder_impl
{
struct spinel_raster_builder * raster_builder;
struct spinel_device * device;
struct spinel_rbi_vk vk;
//
// As noted above, the remaining slots in the fills ring are always
// greater than or equal to the remaining slots in the tc ring, so we
// use simpler accounting for the tc and rc rings.
//
struct
{
struct
{
struct spinel_cmd_fill * extent;
struct spinel_ring ring;
} cf; // fill commands
struct
{
SPN_TYPE_F32VEC4 * extent;
struct spinel_next next;
} tc; // transforms & clips
struct
{
spinel_handle_t * extent;
struct spinel_next next;
} rc; // rasters in cohort
} mapped;
//
// work in progress raster
//
struct
{
struct spinel_rbi_head_span cf; // fill commands
struct spinel_rbi_head_span tc; // transforms and clips
} wip;
//
// Resources released upon grid completion:
//
// - Path handles can be released after the rasterization stage.
//
// - Raster handles can be released after the entire rasterization
// sub-pipeline completes.
//
// - Dispatch records and associated mapped spans are released in
// ring order.
//
spinel_weakref_epoch_t epoch;
struct
{
spinel_handle_t * extent;
} paths;
struct
{
spinel_handle_t * extent;
} rasters;
struct
{
struct spinel_rbi_dispatch * extent;
struct spinel_ring ring;
} dispatches;
};
//
//
//
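// A raster builder is "staged" when its host-visible rings are not
// device-local and their spans must be copied to device-local buffers at
// flush time (see spinel_rbi_copy_ring).
//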
static bool
spinel_rbi_is_staged(struct spinel_device const * device)
{
return !spinel_allocator_is_device_local(&device->allocator.device.perm.hw_dr);
}
//
// These pfns are installed when the Spinel context is lost
//
static spinel_result_t
spinel_rbi_lost_begin(struct spinel_raster_builder_impl * impl)
{
return SPN_ERROR_RASTER_BUILDER_LOST;
}
static spinel_result_t
spinel_rbi_lost_end(struct spinel_raster_builder_impl * impl, spinel_raster_t * raster)
{
*raster = SPN_RASTER_INVALID; // FIXME -- SPN_TYPED_HANDLE_INVALID
return SPN_ERROR_RASTER_BUILDER_LOST;
}
static spinel_result_t
spinel_rbi_release(struct spinel_raster_builder_impl * impl);
static spinel_result_t
spinel_rbi_lost_release(struct spinel_raster_builder_impl * impl)
{
//
// FIXME -- releasing a lost raster builder might eventually require a
// specialized function. For now, just call the default release.
//
return spinel_rbi_release(impl);
}
static spinel_result_t
spinel_rbi_lost_flush(struct spinel_raster_builder_impl * impl)
{
return SPN_ERROR_RASTER_BUILDER_LOST;
}
static spinel_result_t
spinel_rbi_lost_add(struct spinel_raster_builder_impl * impl,
spinel_path_t const * paths,
spinel_transform_weakref_t * transform_weakrefs,
spinel_transform_t const * transforms,
spinel_clip_weakref_t * clip_weakrefs,
spinel_clip_t const * clips,
uint32_t count)
{
return SPN_ERROR_RASTER_BUILDER_LOST;
}
//
// If (wip.span == mapped.ring.size) then the raster is too long and
// the raster builder is terminally "lost". The raster builder should
// be released and a new one created.
//
static void
spinel_rbi_lost(struct spinel_raster_builder_impl * impl)
{
struct spinel_raster_builder * const rb = impl->raster_builder;
rb->begin = spinel_rbi_lost_begin;
rb->end = spinel_rbi_lost_end;
rb->release = spinel_rbi_lost_release;
rb->flush = spinel_rbi_lost_flush;
rb->add = spinel_rbi_lost_add;
}
static void
spinel_rbi_raster_append(struct spinel_raster_builder_impl * impl, spinel_handle_t handle)
{
uint32_t const idx = spinel_next_acquire_1(&impl->mapped.rc.next);
impl->mapped.rc.extent[idx] = handle; // device
impl->rasters.extent[idx] = handle; // host
}
//
// A dispatch captures the ring spans (cf/tc/rc) of either a dispatched or
// the work-in-progress compute grid.
//
static struct spinel_rbi_dispatch *
spinel_rbi_dispatch_head(struct spinel_raster_builder_impl * impl)
{
assert(!spinel_ring_is_empty(&impl->dispatches.ring));
return impl->dispatches.extent + impl->dispatches.ring.head;
}
static struct spinel_rbi_dispatch *
spinel_rbi_dispatch_tail(struct spinel_raster_builder_impl * impl)
{
assert(!spinel_ring_is_full(&impl->dispatches.ring));
return impl->dispatches.extent + impl->dispatches.ring.tail;
}
static void
spinel_rbi_dispatch_drop(struct spinel_raster_builder_impl * impl)
{
struct spinel_ring * const ring = &impl->dispatches.ring;
spinel_ring_drop_1(ring);
}
static void
spinel_rbi_dispatch_head_init(struct spinel_raster_builder_impl * impl)
{
//
// Don't initialize this with a spinel_rbi_dispatch struct and designated
// initializers because each dispatch structure has precalculated .dbi
// buffers.
//
// Per-member initializers are fine.
//
struct spinel_rbi_dispatch * const dispatch = spinel_rbi_dispatch_head(impl);
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_INVALID);
dispatch->cf = impl->wip.cf;
dispatch->tc = impl->wip.tc;
dispatch->rc = (struct spinel_rbi_head_span){ .head = impl->mapped.rc.next.head, .span = 0 };
dispatch->delayed = SPN_DEPS_DELAYED_SEMAPHORE_INVALID;
dispatch->state = SPN_RBI_DISPATCH_STATE_RECORDING;
}
static void
spinel_rbi_dispatch_acquire(struct spinel_raster_builder_impl * impl)
{
struct spinel_ring * const ring = &impl->dispatches.ring;
struct spinel_device * const device = impl->device;
while (spinel_ring_is_empty(ring))
{
spinel_deps_drain_1(device->deps, &device->vk);
}
spinel_rbi_dispatch_head_init(impl);
}
//
//
//
static void
spinel_rbi_dispatch_append_wip(struct spinel_raster_builder_impl * impl,
struct spinel_rbi_dispatch * dispatch)
{
dispatch->cf.span += impl->wip.cf.span;
dispatch->tc.span += impl->wip.tc.span;
dispatch->rc.span += 1;
}
//
// We record where the *next* work-in-progress raster will start in the ring
// along with its rolling counter.
//
static void
spinel_rbi_wip_reset(struct spinel_raster_builder_impl * impl)
{
impl->wip.cf = (struct spinel_rbi_head_span){ .head = impl->mapped.cf.ring.head, .span = 0 };
impl->wip.tc = (struct spinel_rbi_head_span){ .head = impl->mapped.tc.next.head, .span = 0 };
}
//
//
//
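// Copy the used span of a host ring to its device-local counterpart,
// splitting the copy into two regions when the span wraps the end of the
// ring.
//
// Illustrative example: ring_size = 16, head = 14 and span = 5 copy 2
// elements at offset 14 and 3 elements at offset 0.
//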
static void
spinel_rbi_copy_ring(VkCommandBuffer cb,
VkDescriptorBufferInfo const * h,
VkDescriptorBufferInfo const * d,
VkDeviceSize elem_size,
uint32_t ring_size,
struct spinel_rbi_head_span const * head_span)
{
VkBufferCopy bcs[2];
uint32_t bc_count;
bool const is_wrap = (head_span->span + head_span->head) > ring_size;
uint32_t const span_hi = is_wrap ? (ring_size - head_span->head) : head_span->span;
VkDeviceSize offset_hi = elem_size * head_span->head;
bcs[0].srcOffset = h->offset + offset_hi;
bcs[0].dstOffset = d->offset + offset_hi;
bcs[0].size = elem_size * span_hi;
if (is_wrap)
{
uint32_t const span_lo = head_span->span - span_hi;
bcs[1].srcOffset = h->offset;
bcs[1].dstOffset = d->offset;
bcs[1].size = elem_size * span_lo;
bc_count = 2;
}
else
{
bc_count = 1;
}
vkCmdCopyBuffer(cb, h->buffer, d->buffer, bc_count, bcs);
}
//
//
//
static void
spinel_rbi_flush_complete(void * data0, void * data1)
{
struct spinel_raster_builder_impl * const impl = data0;
struct spinel_rbi_dispatch * const dispatch = data1;
struct spinel_device * const device = impl->device;
//
// These raster handles are now materialized so invalidate their dependencies.
//
spinel_deps_delayed_detach_ring(device->deps,
impl->rasters.extent,
impl->mapped.rc.next.size,
dispatch->rc.head,
dispatch->rc.span);
//
// Release paths -- may invoke wait()
//
// FIXME(allanmac): Paths could be released much earlier if we're willing to
// complicate the submission and launch an additional command buffer.
//
spinel_device_release_d_paths_ring(device,
impl->paths.extent,
impl->mapped.cf.ring.size,
dispatch->cf.head,
dispatch->cf.span);
//
// Release the rasters -- may invoke wait()
//
spinel_device_release_d_rasters_ring(device,
impl->rasters.extent,
impl->mapped.rc.next.size,
dispatch->rc.head,
dispatch->rc.span);
//
// If the dispatch is the tail of the ring then try to release as many
// dispatch records as possible...
//
// Note that dispatches can complete in any order but the ring releases need
// to occur in order.
//
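// For example, if dispatch 1 completes before dispatch 0, dispatch 1 is only
// marked COMPLETE below; its cf span and dispatch record are released later,
// when dispatch 0 completes and the tail walk reaches dispatch 1.
//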
assert(dispatch->state != SPN_RBI_DISPATCH_STATE_COMPLETE);
dispatch->state = SPN_RBI_DISPATCH_STATE_COMPLETE;
struct spinel_rbi_dispatch * tail = spinel_rbi_dispatch_tail(impl);
while (tail->state == SPN_RBI_DISPATCH_STATE_COMPLETE)
{
// will always be true
assert(impl->mapped.cf.ring.tail == tail->cf.head);
// release the blocks and cmds
spinel_ring_release_n(&impl->mapped.cf.ring, tail->cf.span);
// release the dispatch
spinel_ring_release_n(&impl->dispatches.ring, 1);
// mark as invalid
tail->state = SPN_RBI_DISPATCH_STATE_INVALID;
// any dispatches still in flight?
if (spinel_ring_is_full(&impl->dispatches.ring))
{
break;
}
// get new tail
tail = spinel_rbi_dispatch_tail(impl);
}
}
//
//
//
static VkPipelineStageFlags
spinel_rbi_flush_record(VkCommandBuffer cb, void * data0, void * data1)
{
//
// 0. ZEROES & COPY
//
// Prepares device-side data structures.
//
// 1. FILL_SCAN
//
// Compute the prefix sum of each path type in the fill's path.
//
// 2. FILL_DISPATCH
//
// Take the atomically updated count of rasterization commands and
// initialize a workgroup triple for vkCmdDispatchIndirect().
//
// 3. FILL_EXPAND
//
// Expand the fill command into rasterization commands and store them to
// a temporary buffer:
//
// |<lines><quads><cubics><rat_quads><rat_cubics>|
//
// 4. RASTERIZE_LINES/QUADS/CUBICS/RAT_QUADS/RAT_CUBICS
//
// For each path type, indirectly dispatch a rasterizer.
//
// 5. INDIRECT RADIX SORT TTRK KEYS
//
// 6. SEGMENT_TTRK_DISPATCH
//
// 7. SEGMENT_TTRK
//
// 8. RASTERS_ALLOC
//
// 9. RASTERS_PREFIX
//
struct spinel_raster_builder_impl * const impl = data0;
struct spinel_rbi_dispatch * const dispatch = data1;
struct spinel_device * const device = impl->device;
////////////////////////////////////////////////////////////////
//
// FILL: ZERO RASTER COHORT META TABLE
// FILL: ZERO RASTERIZE.FILL_SCAN_COUNTS
//
////////////////////////////////////////////////////////////////
{
//
// * Zero ttrks.meta SoA arrays *after* .alloc[]
// * Zero ttrks.count_dispatch
//
// NOTE(allanmac): The ttrks.meta SoA fills have no dependencies until step
// (7) so there may be an opportunity to delay this. But the
// ttrks.count_dispatch needs to be zeroed earlier. Since they're adjacent
// we go ahead and zero them together.
//
// clang-format off
VkDeviceSize const meta_offset = SPN_BUFFER_OFFSETOF(ttrks, meta.rk_off);
VkDeviceSize const count_dispatch_offset = SPN_BUFFER_OFFSETOF(ttrks, count_dispatch);
VkDeviceSize const count_dispatch_size = SPN_BUFFER_MEMBER_SIZE(ttrks, count_dispatch);
VkDeviceSize const span = count_dispatch_offset + count_dispatch_size - meta_offset;
// clang-format on
vkCmdFillBuffer(cb,
dispatch->vk.ttrks.dbi.buffer,
dispatch->vk.ttrks.dbi.offset + meta_offset,
span,
0);
}
////////////////////////////////////////////////////////////////
//
// FILL: ZERO RASTERIZE.FILL_SCAN_COUNTS
//
////////////////////////////////////////////////////////////////
{
VkDeviceSize const offset = SPN_BUFFER_OFFSETOF(rasterize_fill_scan, counts);
VkDeviceSize const size = SPN_BUFFER_MEMBER_SIZE(rasterize_fill_scan, counts);
vkCmdFillBuffer(cb,
dispatch->vk.fill_scan.dbi.buffer,
dispatch->vk.fill_scan.dbi.offset + offset,
size,
0);
}
////////////////////////////////////////////////////////////////
//
// COPY COMMAND RINGS
//
// On a discrete GPU, 1-2 regions of 3 rings are copied from H>D
//
// FIXME(allanmac): Only the .cf ring is used by fill_scan so the .tc and .rc
// copies could be delayed.
//
////////////////////////////////////////////////////////////////
struct spinel_target_config const * const config = &device->ti.config;
if (spinel_rbi_is_staged(device))
{
// CF
spinel_rbi_copy_ring(cb,
&impl->vk.rings.cf.h.dbi_dm.dbi,
&impl->vk.rings.cf.d.dbi_dm.dbi,
sizeof(*impl->mapped.cf.extent),
impl->mapped.cf.ring.size,
&dispatch->cf);
// TC
spinel_rbi_copy_ring(cb,
&impl->vk.rings.tc.h.dbi_dm.dbi,
&impl->vk.rings.tc.d.dbi_dm.dbi,
sizeof(*impl->mapped.tc.extent),
impl->mapped.tc.next.size,
&dispatch->tc);
// RC
spinel_rbi_copy_ring(cb,
&impl->vk.rings.rc.h.dbi_dm.dbi,
&impl->vk.rings.rc.d.dbi_dm.dbi,
sizeof(*impl->mapped.rc.extent),
impl->mapped.rc.next.size,
&dispatch->rc);
}
////////////////////////////////////////////////////////////////
//
// BARRIER: FILLS & COPIES
//
////////////////////////////////////////////////////////////////
vk_barrier_transfer_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: FILL_SCAN
//
////////////////////////////////////////////////////////////////
struct spinel_push_fill_scan const push_fill_scan = {
.devaddr_rasterize_fill_scan = dispatch->vk.fill_scan.devaddr,
.devaddr_rasterize_fill_cmds = impl->vk.rings.cf.d.devaddr,
.devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr,
.devaddr_block_pool_host_map = device->block_pool.vk.dbi_devaddr.host_map.devaddr,
.cmd_head = dispatch->cf.head,
.cmd_size = impl->mapped.cf.ring.size,
.cmd_span = dispatch->cf.span
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.fill_scan,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_fill_scan),
&push_fill_scan);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.fill_scan);
{
//
// Each invocation processes multiple commands.
//
uint32_t const cmds_per_wg = config->raster_builder.fill_scan.rows * //
config->group_sizes.named.fill_scan.workgroup;
uint32_t const wg_count = (dispatch->cf.span + cmds_per_wg - 1) / cmds_per_wg;
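//
// Illustration with hypothetical config values: fill_scan.rows = 4 and a
// workgroup size of 256 give cmds_per_wg = 1024, so a cf span of 3000
// commands dispatches ceil(3000 / 1024) = 3 workgroups.
//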
vkCmdDispatch(cb, wg_count, 1, 1);
}
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: FILL_DISPATCH
//
// NOTE: PUSH CONSTANTS ARE COMPATIBLE WITH FILL_SCAN
//
// A single workgroup initializes the indirect dispatches
//
// Either 4 or 8 invocations are required (SPN_RAST_TYPE_COUNT == 8)
//
////////////////////////////////////////////////////////////////
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.fill_dispatch);
vkCmdDispatch(cb, 1, 1, 1);
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
// Note that FILL_EXPAND reads the u32vec4 initialized by FILL_DISPATCH but
// only RASTERIZE_XXX indirectly dispatches off of the u32vec4.
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: FILL_EXPAND
//
// NOTE: PUSH CONSTANTS ARE MOSTLY COMPATIBLE WITH FILL_SCAN
//
////////////////////////////////////////////////////////////////
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.fill_expand,
VK_SHADER_STAGE_COMPUTE_BIT,
SPN_PUSH_OFFSETOF(fill_expand, devaddr_rasterize_rast_cmds),
SPN_PUSH_MEMBER_SIZE(fill_expand, devaddr_rasterize_rast_cmds),
&dispatch->vk.rast_cmds.devaddr);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.fill_expand);
{
//
// Dispatch one subgroup per command
//
uint32_t const sgs_per_wg = config->group_sizes.named.fill_expand.workgroup >> //
config->group_sizes.named.fill_expand.subgroup_log2;
uint32_t const wg_count = (dispatch->cf.span + sgs_per_wg - 1) / sgs_per_wg;
vkCmdDispatch(cb, wg_count, 1, 1);
}
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>INDIRECT|COMPUTE
//
// Note that the indirect dispatch was initialized by `fill_dispatch`.
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_indirect_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// TODO(allanmac): PIPELINE RASTERIZE_DISPATCH
//
// The indirect dispatch of the rasterization *may* need to support workgroups
// larger than one subgroup if the device architecture doesn't achieve max
// residency when dispatching single subgroup workgroups. This would require
// another compute shader to adjust the dispatch counts. For now, just assume
// (workgroup_size == subgroup_size).
//
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
//
// PIPELINES: RASTERIZE_PROJ_LINE
// RASTERIZE_PROJ_QUAD
// RASTERIZE_PROJ_CUBIC
// RASTERIZE_LINE
// RASTERIZE_QUAD
// RASTERIZE_CUBIC
// RASTERIZE_RAT_QUAD
// RASTERIZE_RAT_CUBIC
//
////////////////////////////////////////////////////////////////
assert(config->group_sizes.named.rasterize_line.workgroup ==
(1u << config->group_sizes.named.rasterize_line.subgroup_log2));
struct spinel_push_rasterize const push_rasterize = {
.devaddr_block_pool_ids = device->block_pool.vk.dbi_devaddr.ids.devaddr,
.devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr,
.devaddr_rasterize_fill_quads = impl->vk.rings.tc.d.devaddr,
.devaddr_rasterize_fill_scan = dispatch->vk.fill_scan.devaddr,
.devaddr_rasterize_rast_cmds = dispatch->vk.rast_cmds.devaddr,
.devaddr_ttrks = dispatch->vk.ttrks.devaddr,
.bp_mask = device->block_pool.bp_mask,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.rasterize_line,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_rasterize),
&push_rasterize);
VkBuffer const rasterize_buffer = dispatch->vk.fill_scan.dbi.buffer;
VkDeviceSize const rasterize_offset = dispatch->vk.fill_scan.dbi.offset + //
SPN_BUFFER_OFFSETOF(rasterize_fill_scan, dispatch);
#undef SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X
#define SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(_p, _i, _n) \
{ \
vkCmdBindPipeline(cb, \
VK_PIPELINE_BIND_POINT_COMPUTE, \
device->ti.pipelines.named.rasterize_##_p); \
\
vkCmdDispatchIndirect(cb, rasterize_buffer, rasterize_offset + _i * sizeof(SPN_TYPE_U32VEC4)); \
}
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND()
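//
// The expansion above binds and indirectly dispatches one rasterizer per
// path segment type; each reads its own u32vec4 workgroup triple from the
// rasterize_fill_scan dispatch array initialized by FILL_DISPATCH.
//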
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>INDIRECT|COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_indirect_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// RADIX SORT INDIRECT
//
// The "rasterize_fill_scan" "rasterize_rast_cmds" extents are no longer used
// at this point.
//
////////////////////////////////////////////////////////////////
VkDescriptorBufferInfo const dbi_ttrks_count = {
.buffer = dispatch->vk.ttrks.dbi.buffer,
.offset = dispatch->vk.ttrks.dbi.offset + SPN_BUFFER_OFFSETOF(ttrks, count_dispatch.w),
.range = sizeof(uint32_t),
};
VkDescriptorBufferInfo const dbi_ttrk_keyvals_even = {
.buffer = dispatch->vk.ttrks.dbi.buffer,
.offset = dispatch->vk.ttrks.dbi.offset + SPN_BUFFER_OFFSETOF(ttrks, keyvals),
.range = dispatch->vk.ttrks.dbi.range - SPN_BUFFER_OFFSETOF(ttrks, keyvals),
};
struct radix_sort_vk_sort_indirect_info const info = {
.ext = NULL,
.key_bits = SPN_TTRK_BITS_XY_COHORT,
.count = dbi_ttrks_count,
.keyvals_even = dbi_ttrk_keyvals_even,
.keyvals_odd = dispatch->vk.ttrk_keyvals_odd.dbi,
.internal = dispatch->vk.rs.internal.dbi,
.indirect = dispatch->vk.rs.indirect.dbi,
};
VkDescriptorBufferInfo dbi_ttrk_keyvals_out;
radix_sort_vk_sort_indirect(device->ti.rs, &info, device->vk.d, cb, &dbi_ttrk_keyvals_out);
// Device address of extent output by radix sort
VkDeviceAddress const devaddr_ttrk_keyvals_out = spinel_dbi_to_devaddr(device->vk.d, //
&dbi_ttrk_keyvals_out);
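// Note: the sorted keyvals may land in either the "even" or "odd" keyvals
// buffer depending on the number of sorting passes, so the output extent is
// reported by the sort rather than assumed.
//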
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: TTRKS_SEGMENT_DISPATCH
//
// FIXME(allanmac): push_ttrks_segment_dispatch is "push compatible" with
// push_ttrks_segment
//
////////////////////////////////////////////////////////////////
struct spinel_push_ttrks_segment_dispatch const push_ttrks_segment_dispatch = {
.devaddr_ttrks_header = dispatch->vk.ttrks.devaddr,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.ttrks_segment_dispatch,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_ttrks_segment_dispatch),
&push_ttrks_segment_dispatch);
vkCmdBindPipeline(cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
device->ti.pipelines.named.ttrks_segment_dispatch);
vkCmdDispatch(cb, 1, 1, 1); // A single invocation initializes the indirect dispatches
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>INDIRECT|COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_indirect_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: TTRKS_SEGMENT
//
// FIXME(allanmac): push_ttrks_segment_dispatch is "push compatible" with
// push_ttrks_segment
//
////////////////////////////////////////////////////////////////
struct spinel_push_ttrks_segment const push_ttrks_segment = {
.devaddr_ttrks_header = dispatch->vk.ttrks.devaddr,
.devaddr_ttrk_keyvals = devaddr_ttrk_keyvals_out,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.ttrks_segment,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_ttrks_segment),
&push_ttrks_segment);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.ttrks_segment);
VkDeviceSize const ttrks_segment_count_dispatch_offset =
dispatch->vk.ttrks.dbi.offset + SPN_BUFFER_OFFSETOF(ttrks, count_dispatch);
vkCmdDispatchIndirect(cb, dispatch->vk.ttrks.dbi.buffer, ttrks_segment_count_dispatch_offset);
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: RASTERS_ALLOC
//
////////////////////////////////////////////////////////////////
struct spinel_push_rasters_alloc const push_rasters_alloc = {
.devaddr_raster_ids = impl->vk.rings.rc.d.devaddr,
.devaddr_ttrks_header = dispatch->vk.ttrks.devaddr,
.devaddr_block_pool_ids = device->block_pool.vk.dbi_devaddr.ids.devaddr,
.devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr,
.devaddr_block_pool_host_map = device->block_pool.vk.dbi_devaddr.host_map.devaddr,
.ids_size = impl->mapped.rc.next.size,
.ids_head = dispatch->rc.head,
.ids_span = dispatch->rc.span,
.bp_mask = device->block_pool.bp_mask,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.rasters_alloc,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_rasters_alloc),
&push_rasters_alloc);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.rasters_alloc);
{
//
// Dispatch one thread per raster rounded up to a workgroup
//
uint32_t const wg_size = config->group_sizes.named.rasters_alloc.workgroup;
uint32_t const wg_count = (dispatch->rc.span + wg_size - 1) / wg_size;
vkCmdDispatch(cb, wg_count, 1, 1);
}
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: RASTERS_PREFIX
//
////////////////////////////////////////////////////////////////
struct spinel_push_rasters_prefix const push_rasters_prefix = {
.devaddr_block_pool_ids = device->block_pool.vk.dbi_devaddr.ids.devaddr,
.devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr,
.devaddr_ttrks_header = dispatch->vk.ttrks.devaddr,
.devaddr_ttrk_keyvals = devaddr_ttrk_keyvals_out,
.ids_size = impl->mapped.rc.next.size,
.ids_head = dispatch->rc.head,
.ids_span = dispatch->rc.span,
.bp_mask = device->block_pool.bp_mask,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.rasters_prefix,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_rasters_prefix),
&push_rasters_prefix);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.rasters_prefix);
{
//
// Dispatch one subgroup per raster
//
uint32_t const sgs_per_wg = config->group_sizes.named.rasters_prefix.workgroup >>
config->group_sizes.named.rasters_prefix.subgroup_log2;
uint32_t const wg_count = (dispatch->rc.span + sgs_per_wg - 1) / sgs_per_wg;
vkCmdDispatch(cb, wg_count, 1, 1);
}
//
// NOTE(allanmac):
//
// The `deps` scheduler assumes that the command buffers associated with
// delayed semaphores always end with a compute shader
// (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT).
//
// Only the path builder and raster builder acquire delayed semaphores.
//
return VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
}
//
//
//
static void
spinel_rbi_flush_submit(void * data0, void * data1)
{
struct spinel_raster_builder_impl * const impl = data0;
struct spinel_rbi_dispatch * const dispatch = data1;
struct spinel_device * const device = impl->device;
assert(dispatch->cf.span > 0);
assert(dispatch == spinel_rbi_dispatch_head(impl));
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_RECORDING);
//
// Flush if the ring is non-coherent
//
if (!spinel_allocator_is_coherent(&device->allocator.device.perm.hw_dr))
{
// Flush CF
spinel_ring_flush(&device->vk,
impl->vk.rings.cf.h.dbi_dm.dm,
0UL,
impl->mapped.cf.ring.size,
dispatch->cf.head,
dispatch->cf.span,
sizeof(*impl->mapped.cf.extent));
// Flush TC
spinel_ring_flush(&device->vk,
impl->vk.rings.tc.h.dbi_dm.dm,
0UL,
impl->mapped.tc.next.size,
dispatch->tc.head,
dispatch->tc.span,
sizeof(*impl->mapped.tc.extent));
// Flush RC
spinel_ring_flush(&device->vk,
impl->vk.rings.rc.h.dbi_dm.dm,
0UL,
impl->mapped.rc.next.size,
dispatch->rc.head,
dispatch->rc.span,
sizeof(*impl->mapped.rc.extent));
}
//
// Acquire an immediate semaphore
//
struct spinel_deps_immediate_submit_info const disi = {
.record = {
.pfn = spinel_rbi_flush_record,
.data0 = impl,
.data1 = dispatch,
},
.wait = {
.delayed = {
.handles = {
.extent = impl->paths.extent,
.size = impl->mapped.cf.ring.size,
.head = dispatch->cf.head,
.span = dispatch->cf.span,
},
},
},
.completion = {
.pfn = spinel_rbi_flush_complete,
.data0 = impl,
.data1 = dispatch,
},
.signal = {
.delayed = {
.count = 1,
.semaphores = {
dispatch->delayed,
},
},
},
};
//
// Invalidate all outstanding transform and clip weakrefs
//
spinel_weakref_epoch_increment(&impl->epoch);
//
// The current dispatch is in flight so drop it
//
spinel_rbi_dispatch_drop(impl);
//
// Move to pending state
//
dispatch->state = SPN_RBI_DISPATCH_STATE_PENDING;
//
// We don't need to save the immediate semaphore
//
spinel_deps_immediate_submit(device->deps, &device->vk, &disi, NULL);
//
// Acquire and initialize the next dispatch
//
spinel_rbi_dispatch_acquire(impl);
}
//
//
//
static spinel_result_t
spinel_rbi_flush(struct spinel_raster_builder_impl * impl)
{
//
// Anything to launch?
//
struct spinel_rbi_dispatch const * const dispatch = spinel_rbi_dispatch_head(impl);
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_RECORDING);
if (dispatch->rc.span == 0)
{
assert(dispatch->cf.span == 0);
return SPN_SUCCESS;
}
//
// Invoke the delayed submission action
//
spinel_deps_delayed_flush(impl->device->deps, dispatch->delayed);
return SPN_SUCCESS;
}
//
//
//
static spinel_result_t
spinel_rbi_begin(struct spinel_raster_builder_impl * impl)
{
return SPN_SUCCESS;
}
//
//
//
static spinel_result_t
spinel_rbi_end(struct spinel_raster_builder_impl * impl, spinel_raster_t * raster)
{
//
// acquire raster host id
//
struct spinel_device * const device = impl->device;
raster->handle = spinel_device_handle_acquire(device);
//
// get the head dispatch
//
struct spinel_rbi_dispatch * const dispatch = spinel_rbi_dispatch_head(impl);
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_RECORDING);
//
// do we need to acquire a delayed semaphore for the dispatch?
//
if (dispatch->rc.span == 0)
{
struct spinel_deps_acquire_delayed_info const dadi = {
.submission = { .pfn = spinel_rbi_flush_submit, //
.data0 = impl,
.data1 = dispatch },
};
dispatch->delayed = spinel_deps_delayed_acquire(device->deps, &device->vk, &dadi);
}
// associate delayed semaphore with handle
spinel_deps_delayed_attach(device->deps, raster->handle, dispatch->delayed);
// save raster handle to ring
spinel_rbi_raster_append(impl, raster->handle);
// update head dispatch's span
spinel_rbi_dispatch_append_wip(impl, dispatch);
// reset wip
spinel_rbi_wip_reset(impl);
//
// * flush if the raster cohort size limit has been reached
// * flush if the fill command "eager" limit has been reached
//
struct spinel_target_config const * const config = &device->ti.config;
if ((dispatch->rc.span >= config->raster_builder.size.cohort) || // "is_rc_full"
(dispatch->cf.span >= config->raster_builder.size.eager)) // "is_cf_eager"
{
return spinel_rbi_flush(impl);
}
return SPN_SUCCESS;
}
//
// If the raster builder is directly exposed as a public API then
// validate the transform and clip weakref indices.
//
// If a fuzzer alters the weakref epoch then the weakref is invalid --
// we don't need to check for this case since that's the purpose of
// the weakref.
//
// If a fuzzer alters the weakref's index but its epoch still matches
// the current epoch then we simply need to validate that its
// index is *potentially* valid -- the weakref might still be
// invalidated by an about-to-happen spinel_rbi_flush().
//
// clang-format off
static spinel_result_t
spinel_rbi_validate_transform_weakref_indices(struct spinel_ring const * cf_ring,
struct spinel_rbi_dispatch const * dispatch,
spinel_transform_weakref_t const * transform_weakrefs,
uint32_t const count)
// clang-format on
{
//
// FIXME(allanmac)
//
// For non-null weakrefs, check that the index is within the WIP span
//
return SPN_SUCCESS;
}
static spinel_result_t
spinel_rbi_validate_clip_weakref_indices(struct spinel_ring const * cf_ring,
struct spinel_rbi_dispatch const * dispatch,
spinel_clip_weakref_t const * clip_weakrefs,
uint32_t const count)
{
//
// FIXME(allanmac)
//
// For non-null weakrefs, check that the index is within the WIP span
//
return SPN_SUCCESS;
}
//
// Permute lo and hi transform
//
// src: { sx shx tx shy sy ty w0 w1 } // Row-ordered matrix
// dst: { sx shx shy sy tx ty w0 w1 } // GPU-friendly ordering
//
static void
spinel_rbi_transform_copy_lo(SPN_TYPE_F32VEC4 * dst, spinel_transform_t const * src)
{
dst->x = src->sx;
dst->y = src->shx;
dst->z = src->shy;
dst->w = src->sy;
}
static void
spinel_rbi_transform_copy_hi(SPN_TYPE_F32VEC4 * dst, spinel_transform_t const * src)
{
dst->x = src->tx;
dst->y = src->ty;
dst->z = src->w0;
dst->w = src->w1;
}
//
//
//
static spinel_result_t
spinel_rbi_add(struct spinel_raster_builder_impl * impl,
spinel_path_t const * paths,
spinel_transform_weakref_t * transform_weakrefs,
spinel_transform_t const * transforms,
spinel_clip_weakref_t * clip_weakrefs,
spinel_clip_t const * clips,
uint32_t count)
{
// Anything to do?
if (count == 0)
{
return SPN_SUCCESS;
}
//
// If the number of work-in-progress paths is larger than the ring then fail
// hard and lose the raster builder.
//
struct spinel_ring * const cf_ring = &impl->mapped.cf.ring;
if (impl->wip.cf.span + count > cf_ring->size)
{
spinel_rbi_lost(impl);
return SPN_ERROR_RASTER_BUILDER_LOST;
}
//
// If not enough entries are left in the command ring then flush now and wait
// for cf slots to be made available.
//
struct spinel_device * const device = impl->device;
if (count > cf_ring->rem)
{
//
// Launch whatever is in the ring and then wait for cf slots...
//
spinel_rbi_flush(impl);
// clang-format off
do {
spinel_deps_drain_1(device->deps, &device->vk);
} while (count > cf_ring->rem);
// clang-format on
}
//
// Validate the paths before we proceed
//
spinel_result_t result;
result = spinel_device_validate_d_paths(device, paths, count);
if (result != SPN_SUCCESS)
{
return result;
}
//
// Validate the transform and clip weakref indices -- this is cheap!
//
struct spinel_rbi_dispatch * const dispatch = spinel_rbi_dispatch_head(impl);
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_RECORDING);
result = spinel_rbi_validate_transform_weakref_indices(cf_ring, //
dispatch,
transform_weakrefs,
count);
if (result != SPN_SUCCESS)
{
return result;
}
result = spinel_rbi_validate_clip_weakref_indices(cf_ring, //
dispatch,
clip_weakrefs,
count);
if (result != SPN_SUCCESS)
{
return result;
}
//
// Everything validates... retain the paths on the device
//
spinel_device_retain_d_paths(device, paths, count);
//
// Increment the cf span
//
impl->wip.cf.span += count;
//
// There will always be enough room in the TC ring so only its head
// needs to be tracked.
//
struct spinel_next * const tc_next = &impl->mapped.tc.next;
//
// The command's cohort id is the same for all commands
//
struct spinel_cmd_fill cf = { 0 };
cf.cohort = dispatch->rc.span & 0x7fff;
//
// Append commands to the cf ring and dependent quads to the tc ring
//
while (true)
{
uint32_t const cf_idx = spinel_ring_acquire_1(cf_ring);
//
// Get the path
//
uint32_t const handle = paths->handle;
impl->paths.extent[cf_idx] = handle;
cf.path_h = handle;
//
// Classify the transform
//
// if (w0==w1==0) then it's an affine matrix
//
cf.transform_type = ((transforms->w0 == 0.0f) && (transforms->w1 == 0.0f))
? SPN_CMD_FILL_TRANSFORM_TYPE_AFFINE
: SPN_CMD_FILL_TRANSFORM_TYPE_PROJECTIVE;
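//
// For example, a pure scale/rotate/translate transform has w0 == w1 == 0 and
// is classified as affine; a nonzero w0 or w1 marks it projective which, per
// the note at the top of this file, elevates Beziers to rationals.
//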
//
// If the weakref exists then reuse existing transform index
//
if (!spinel_transform_weakrefs_get_index(transform_weakrefs, 0, &impl->epoch, &cf.transform))
{
uint32_t span;
uint32_t const t_idx = spinel_next_acquire_2(tc_next, &span);
impl->wip.tc.span += span;
spinel_transform_weakrefs_init(transform_weakrefs, 0, &impl->epoch, t_idx);
spinel_rbi_transform_copy_lo(impl->mapped.tc.extent + t_idx + 0, transforms);
spinel_rbi_transform_copy_hi(impl->mapped.tc.extent + t_idx + 1, transforms);
cf.transform = t_idx;
}
//
// If the weakref exists then reuse existing clip index
//
if (!spinel_clip_weakrefs_get_index(clip_weakrefs, 0, &impl->epoch, &cf.clip))
{
uint32_t const c_idx = spinel_next_acquire_1(tc_next);
impl->wip.tc.span += 1;
spinel_clip_weakrefs_init(clip_weakrefs, 0, &impl->epoch, c_idx);
memcpy(impl->mapped.tc.extent + c_idx, clips, sizeof(*impl->mapped.tc.extent));
cf.clip = c_idx;
}
//
// Store the command to the ring
//
impl->mapped.cf.extent[cf_idx] = cf;
//
// No more paths?
//
if (--count == 0)
{
break;
}
//
// Otherwise, increment pointers
//
// FIXME(allanmac): This will be updated with an argument "template"
// struct.
//
paths += 1;
if (transform_weakrefs != NULL)
{
transform_weakrefs += 1;
}
transforms += 1;
if (clip_weakrefs != NULL)
{
clip_weakrefs += 1;
}
clips += 1;
}
return SPN_SUCCESS;
}
//
//
//
static spinel_result_t
spinel_rbi_release(struct spinel_raster_builder_impl * impl)
{
//
// launch any undispatched rasters
//
spinel_rbi_flush(impl);
//
// wait for all in-flight dispatches to complete
//
struct spinel_ring * const ring = &impl->dispatches.ring;
struct spinel_device * const device = impl->device;
while (!spinel_ring_is_full(ring))
{
spinel_deps_drain_1(device->deps, &device->vk);
}
//
// Dispatch extents
//
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.dispatch.rfs_rrc_tko);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.dispatch.ttrks);
//
// Radix Sort extents
//
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.dispatch.rs.indirect);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.dispatch.rs.internal);
//
// Ring extents
//
vkUnmapMemory(device->vk.d, impl->vk.rings.rc.h.dbi_dm.dm); // not necessary
vkUnmapMemory(device->vk.d, impl->vk.rings.tc.h.dbi_dm.dm); // not necessary
vkUnmapMemory(device->vk.d, impl->vk.rings.cf.h.dbi_dm.dm); // not necessary
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.hw_dr,
device->vk.d,
device->vk.ac,
&impl->vk.rings.rc.h.dbi_dm);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.hw_dr,
device->vk.d,
device->vk.ac,
&impl->vk.rings.tc.h.dbi_dm);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.hw_dr,
device->vk.d,
device->vk.ac,
&impl->vk.rings.cf.h.dbi_dm);
//
// Ring staging extents
//
if (spinel_rbi_is_staged(device))
{
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.rings.rc.d.dbi_dm);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.rings.tc.d.dbi_dm);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.rings.cf.d.dbi_dm);
}
//
// Free host allocations
//
free(impl->rasters.extent);
free(impl->paths.extent);
free(impl->dispatches.extent);
free(impl->raster_builder);
free(impl);
spinel_context_release(device->context);
return SPN_SUCCESS;
}
//
//
//
spinel_result_t
spinel_raster_builder_impl_create(struct spinel_device * device,
spinel_raster_builder_t * raster_builder)
{
spinel_context_retain(device->context);
//
// allocate impl
//
struct spinel_raster_builder_impl * const impl = MALLOC_MACRO(sizeof(*impl));
//
// allocate raster builder
//
struct spinel_raster_builder * const rb = MALLOC_MACRO(sizeof(*rb));
// init impl and rb back-pointers
*raster_builder = rb;
impl->raster_builder = rb;
rb->impl = impl;
// save device
impl->device = device;
//
// init raster builder pfns
//
rb->begin = spinel_rbi_begin;
rb->end = spinel_rbi_end;
rb->release = spinel_rbi_release;
rb->flush = spinel_rbi_flush;
rb->add = spinel_rbi_add;
//
// init refcount & state
//
rb->ref_count = 1;
SPN_ASSERT_STATE_INIT(SPN_RASTER_BUILDER_STATE_READY, rb);
//
// Allocate rings
//
struct spinel_target_config const * const config = &device->ti.config;
assert(config->raster_builder.size.eager <= config->raster_builder.size.ring);
//
// CF: 1 ring entry per command
//
uint32_t const cf_ring_size = config->raster_builder.size.ring;
spinel_ring_init(&impl->mapped.cf.ring, cf_ring_size);
//
// TC: 1 transform + 1 clip = 3 quads
//
// Worst case is 3 quads per command.
//
// NOTE(allanmac): One additional quad is required because transforms require
// 2 consecutive quads and the worst case would be a full ring of commands
// each with a transform and clip.
//
uint32_t const tc_ring_size = cf_ring_size * 3 + 1;
spinel_next_init(&impl->mapped.tc.next, tc_ring_size);
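//
// Illustration with a hypothetical ring size: cf_ring_size = 8192 gives
// tc_ring_size = 8192 * 3 + 1 = 24577 quads -- 2 transform quads plus 1 clip
// quad per command, plus the 1 extra quad noted above.
//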
//
// How many dispatches?
//
uint32_t const max_in_flight = config->raster_builder.size.dispatches;
//
// RC: Worst case is (cohort size * dispatches) rasters.
//
assert(config->raster_builder.size.cohort <= SPN_RASTER_COHORT_MAX_SIZE);
uint32_t const rc_ring_size = config->raster_builder.size.cohort * max_in_flight;
spinel_next_init(&impl->mapped.rc.next, rc_ring_size);
//
// FIXME(allanmac): Allocate one buffer for all rings and one buffer for all
// staging buffers.
//
//
// Allocate and map CF
//
{
VkDeviceSize const cf_size = sizeof(*impl->mapped.cf.extent) * cf_ring_size;
VkDeviceSize const cf_size_ru = ROUND_UP_POW2_MACRO(cf_size, //
device->vk.limits.noncoherent_atom_size);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.hw_dr,
device->vk.pd,
device->vk.d,
device->vk.ac,
cf_size_ru,
NULL,
&impl->vk.rings.cf.h);
vk(MapMemory(device->vk.d,
impl->vk.rings.cf.h.dbi_dm.dm,
0,
VK_WHOLE_SIZE,
0,
(void **)&impl->mapped.cf.extent));
//
// Allocate and map TC
//
VkDeviceSize const tc_size = sizeof(*impl->mapped.tc.extent) * tc_ring_size;
VkDeviceSize const tc_size_ru = ROUND_UP_POW2_MACRO(tc_size, //
device->vk.limits.noncoherent_atom_size);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.hw_dr,
device->vk.pd,
device->vk.d,
device->vk.ac,
tc_size_ru,
NULL,
&impl->vk.rings.tc.h);
vk(MapMemory(device->vk.d,
impl->vk.rings.tc.h.dbi_dm.dm,
0,
VK_WHOLE_SIZE,
0,
(void **)&impl->mapped.tc.extent));
//
// Allocate and map RC
//
VkDeviceSize const rc_size = sizeof(*impl->mapped.rc.extent) * rc_ring_size;
VkDeviceSize const rc_size_ru = ROUND_UP_POW2_MACRO(rc_size, //
device->vk.limits.noncoherent_atom_size);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.hw_dr,
device->vk.pd,
device->vk.d,
device->vk.ac,
rc_size_ru,
NULL,
&impl->vk.rings.rc.h);
vk(MapMemory(device->vk.d,
impl->vk.rings.rc.h.dbi_dm.dm,
0,
VK_WHOLE_SIZE,
0,
(void **)&impl->mapped.rc.extent));
//
// Allocate rasters
//
impl->rasters.extent = MALLOC_MACRO(rc_size_ru); // same size as the mapped rc extent
//
// Are host-writable rings used as staging buffers?
//
if (spinel_rbi_is_staged(device))
{
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
cf_size_ru,
NULL,
&impl->vk.rings.cf.d);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
tc_size_ru,
NULL,
&impl->vk.rings.tc.d);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
rc_size_ru,
NULL,
&impl->vk.rings.rc.d);
}
else
{
impl->vk.rings.cf.d = impl->vk.rings.cf.h;
impl->vk.rings.tc.d = impl->vk.rings.tc.h;
impl->vk.rings.rc.d = impl->vk.rings.rc.h;
}
}
//
// Allocate dispatches and path/raster release extents
//
// Implicitly sets state to SPN_RBI_DISPATCH_STATE_INVALID.
//
impl->dispatches.extent = CALLOC_MACRO(max_in_flight, sizeof(*impl->dispatches.extent));
//
// Allocate paths
//
size_t const paths_size = sizeof(*impl->paths.extent) * config->raster_builder.size.ring;
impl->paths.extent = MALLOC_MACRO(paths_size);
//
// Get radix sort memory requirements
//
struct radix_sort_vk_memory_requirements rs_mr;
radix_sort_vk_get_memory_requirements(device->ti.rs, config->raster_builder.size.ttrks, &rs_mr);
assert(SPN_MEMBER_ALIGN_LIMIT >= rs_mr.keyvals_alignment);
assert(SPN_MEMBER_ALIGN_LIMIT >= rs_mr.internal_alignment);
assert(SPN_MEMBER_ALIGN_LIMIT >= rs_mr.indirect_alignment);
//
// What is the rounded-up size of the ttrks buffer?
//
VkDeviceSize const ttrks_size = SPN_BUFFER_OFFSETOF(ttrks, keyvals) + rs_mr.keyvals_size;
VkDeviceSize const ttrks_size_ru = ROUND_UP_POW2_MACRO(ttrks_size, SPN_MEMBER_ALIGN_LIMIT);
//
// Allocate memory shared across dispatches:
//
// ttrks = max_in_flight * sizeof(ttrks)
//
spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
max_in_flight * ttrks_size_ru,
NULL,
&impl->vk.dispatch.ttrks);
//
// Assign bufrefs to dispatches
//
// FIXME(allanmac): Do all VK objects need both their .dbis and .devaddrs
// initialized?
//
for (uint32_t ii = 0; ii < max_in_flight; ii++)
{
struct spinel_rbi_dispatch * const dispatch = impl->dispatches.extent + ii;
// vk.ttrks
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.ttrks,
&impl->vk.dispatch.ttrks.dbi,
ii * ttrks_size_ru,
ttrks_size_ru);
}
//
// Allocate per-dispatch radix sort internal and indirect buffers
//
// internal = max_in_flight * rs_mr.internal_size
// indirect = max_in_flight * rs_mr.indirect_size
//
// clang-format off
VkDeviceSize const rs_internal_size_ru = ROUND_UP_POW2_MACRO(rs_mr.internal_size, SPN_MEMBER_ALIGN_LIMIT);
VkDeviceSize const rs_indirect_size_ru = ROUND_UP_POW2_MACRO(rs_mr.indirect_size, SPN_MEMBER_ALIGN_LIMIT);
// clang-format on
spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
max_in_flight * rs_internal_size_ru,
NULL,
&impl->vk.dispatch.rs.internal);
spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
max_in_flight * rs_indirect_size_ru,
NULL,
&impl->vk.dispatch.rs.indirect);
//
// Assign bufrefs to dispatches
//
// FIXME(allanmac): Do all VK objects need both their .dbis and .devaddrs
// initialized?
//
for (uint32_t ii = 0; ii < max_in_flight; ii++)
{
struct spinel_rbi_dispatch * const dispatch = impl->dispatches.extent + ii;
// vk.rs.internal
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.rs.internal,
&impl->vk.dispatch.rs.internal.dbi,
ii * rs_internal_size_ru,
rs_internal_size_ru);
// vk.rs.indirect
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.rs.indirect,
&impl->vk.dispatch.rs.indirect.dbi,
ii * rs_indirect_size_ru,
rs_indirect_size_ru);
}
//
// More per-dispatch buffers:
//
// rfs: rasterize fill scan
// rrc: rasterize rast cmds
// tko: ttrk keys odd
//
// We conservatively round them up to ensure the buffers are properly aligned
// on the device.
//
// The rfs/rrc extents can be safely aliased by the later-used tko extent.
//
// TODO(allanmac): The rounding and aliasing can be refined.
//
// clang-format off
uint32_t const rfs_sg_size = 1u << config->group_sizes.named.fill_scan.subgroup_log2;
VkDeviceSize const rfs_block_size_pow2 = sizeof(SPN_TYPE_U32VEC4) * 2 * rfs_sg_size;
VkDeviceSize const rfs_prefix_size = config->raster_builder.size.ring * sizeof(SPN_TYPE_U32VEC4) * 2;
VkDeviceSize const rfs_prefix_size_ru = ROUND_UP_POW2_MACRO(rfs_prefix_size, rfs_block_size_pow2);
VkDeviceSize const rfs_size = SPN_BUFFER_OFFSETOF(rasterize_fill_scan, prefix) + rfs_prefix_size_ru;
VkDeviceSize const rfs_size_ru = ROUND_UP_POW2_MACRO(rfs_size, SPN_MEMBER_ALIGN_LIMIT);
VkDeviceSize const rrc_size = config->raster_builder.size.cmds * sizeof(SPN_TYPE_U32VEC4);
VkDeviceSize const rrc_size_ru = ROUND_UP_POW2_MACRO(rrc_size, SPN_MEMBER_ALIGN_LIMIT);
VkDeviceSize const rfs_rrc_size_ru = rfs_size_ru + rrc_size_ru;
VkDeviceSize const tko_size_ru = ROUND_UP_POW2_MACRO(rs_mr.keyvals_size, SPN_MEMBER_ALIGN_LIMIT);
VkDeviceSize const rfs_rrc_tko_size_ru = MAX_MACRO(VkDeviceSize, rfs_rrc_size_ru, tko_size_ru);
// clang-format on
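//
// Per-dispatch slot layout sketch: the |<-rfs->|<-rrc->| region is later
// aliased by |<-tko->|, so each slot is sized
// max(rfs_size_ru + rrc_size_ru, tko_size_ru) = rfs_rrc_tko_size_ru.
//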
spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
max_in_flight * rfs_rrc_tko_size_ru,
NULL,
&impl->vk.dispatch.rfs_rrc_tko);
//
// Assign bufrefs to dispatches
//
// FIXME(allanmac): Do all VK objects need both their .dbis and .devaddrs
// initialized?
//
for (uint32_t ii = 0; ii < max_in_flight; ii++)
{
struct spinel_rbi_dispatch * const dispatch = impl->dispatches.extent + ii;
// vk.fill_scan
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.fill_scan,
&impl->vk.dispatch.rfs_rrc_tko.dbi,
ii * rfs_rrc_tko_size_ru,
rfs_size_ru);
// vk.rast_cmds
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.rast_cmds,
&impl->vk.dispatch.rfs_rrc_tko.dbi,
ii * rfs_rrc_tko_size_ru + rfs_size_ru,
rrc_size_ru);
// vk.ttrk_keyvals_odd
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.ttrk_keyvals_odd,
&impl->vk.dispatch.rfs_rrc_tko.dbi,
ii * rfs_rrc_tko_size_ru,
tko_size_ru);
}
//
// Initialize rings and first dispatch
//
spinel_ring_init(&impl->dispatches.ring, max_in_flight);
spinel_rbi_wip_reset(impl);
spinel_rbi_dispatch_head_init(impl);
spinel_weakref_epoch_init(&impl->epoch);
return SPN_SUCCESS;
}
//
//
//