// blob: 88723d2f2ac16f5aecb0439bc5e51bee6a553eff [file] [log] [blame]
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// FILL DISPATCH
//
// This fixes up the 8 path primitive counts so they can be used by
// vkCmdDispatchIndirect().
//
// It also computes the exclusive prefix sum of the counts so that each
// rasterization workgroup type (lines, quads, etc.) knows where to
// begin in the cmd_rast[] buffer.
//
// The sum is stored in the 4th element of each quad.
//
//
//
//
#extension GL_GOOGLE_include_directive : require
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_arithmetic : require
//
//
//
#extension GL_EXT_debug_printf : enable
//
//
//
#include "config.h"
#include "push.h"
//
// Local defines
//
// Number of invocations in one subgroup, derived from its log2.
// NOTE(review): SPN_DEVICE_FILL_DISPATCH_SUBGROUP_SIZE_LOG2 is not defined in
// this file -- presumably it comes from the per-device "config.h"; confirm.
// main() only handles a size of exactly 4 or a size >= 8 and #errors otherwise.
//
#define SPN_FILL_DISPATCH_SUBGROUP_SIZE (1 << SPN_DEVICE_FILL_DISPATCH_SUBGROUP_SIZE_LOG2)
//
// Workgroup size
//
// The workgroup's x dimension is set to the subgroup size defined above.
// main() indexes everything by gl_SubgroupInvocationID, so it relies on the
// workgroup being a single subgroup (see the FIXME above main()).
//
layout(local_size_x = SPN_FILL_DISPATCH_SUBGROUP_SIZE) in;
//
// Push constants
//
// Project macro, not defined in this file (presumably from "push.h" -- confirm).
// main() reads `push.devaddr_rasterize_fill_scan` from the resulting block.
//
SPN_PUSH_LAYOUT_FILL_DISPATCH();
//
// Buffer references
//
// Project macro that declares the `rasterize_fill_scan` buffer reference type
// used by main().  main() writes `dispatch[]` and reads `counts[]`.
// NOTE(review): the three arguments look like per-member access qualifiers
// (dispatch = writeonly, counts = readonly, a third member = noaccess), but the
// macro definition is not visible here -- confirm against its declaration.
//
SPN_BUFFER_DEFINE_RASTERIZE_FILL_SCAN(writeonly, readonly, noaccess);
//
// FIXME(allanmac): Support computing multi-subgroup workgroups.
//
//
// Entry point.
//
// For each of the 8 primitive counts in `fill_scan.counts[]`, writes one
// u32vec4 to `fill_scan.dispatch[]` laid out as:
//
//   ( count, 1, 1, exclusive prefix sum of the counts before it )
//
// Per the file header, the leading elements are consumed by
// vkCmdDispatchIndirect() and the 4th element is the offset at which that
// primitive type begins in cmd_rast[].
//
// Two compile-time variants exist: a 4-lane subgroup where each lane handles
// two counts, and an 8-or-more-lane subgroup where each of the first lanes
// handles one count.
//
void
main()
{
  SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(rasterize_fill_scan),
                    fill_scan,
                    push.devaddr_rasterize_fill_scan);

  // Every access below is indexed by the invocation's lane within the subgroup.
  const uint lane = gl_SubgroupInvocationID;

#if (SPN_FILL_DISPATCH_SUBGROUP_SIZE == 4)
  //
  // SIMD4 -- ARM Bifrost4 and SwiftShader
  //
  // Each lane owns two counts: entry `lane` and entry `4 + lane`.
  //
  // Load 8 primitive counts.
  //
  const uint32_t count_lo = fill_scan.counts[0 + lane];
  const uint32_t count_hi = fill_scan.counts[4 + lane];
  //
  // Inclusive scan-add of commands by type.
  //
  // The upper half continues from the total of the lower half, which is the
  // inclusive sum held by the last lane (lane 3).
  //
  const uint32_t incl_lo = subgroupInclusiveAdd(count_lo);
  const uint32_t incl_hi = subgroupInclusiveAdd(count_hi) + subgroupBroadcast(incl_lo, 3);
  //
  // Convert back to exclusive scan-add by removing each lane's own count.
  //
  const uint32_t excl_lo = incl_lo - count_lo;
  const uint32_t excl_hi = incl_hi - count_hi;
  //
  // Store 8 primitive counts.
  //
  fill_scan.dispatch[0 + lane] = u32vec4(count_lo, 1, 1, excl_lo);
  fill_scan.dispatch[4 + lane] = u32vec4(count_hi, 1, 1, excl_hi);

#elif (SPN_FILL_DISPATCH_SUBGROUP_SIZE >= 8)
  //
  // SIMD8+ -- every other GPU I'm aware of...
  //
  // Is valid lane?
  //
  // With exactly 8 lanes every lane is used; with more, only the first
  // SPN_RAST_TYPE_COUNT lanes touch memory.
  //
#if (SPN_FILL_DISPATCH_SUBGROUP_SIZE > 8)
  const bool lane_in_range = (lane < SPN_RAST_TYPE_COUNT);
#else
  const bool lane_in_range = true;
#endif
  //
  // Load 8 primitive counts.
  //
  // Out-of-range lanes keep a count of zero so they do not perturb the scan.
  //
  uint32_t count = 0;

  if (lane_in_range)
    {
      //
      // TODO(https://fxbug.dev/42052237): This eventually needs to setup the
      // dispatch based on rounded-up workgroup size and not subgroup
      // size.
      //
      count = fill_scan.counts[lane];
    }
  //
  // Exclusive scan-add of commands by type.
  //
  // Executed by all lanes, outside of the range check.
  //
  const uint32_t base = subgroupExclusiveAdd(count);
  //
  // Store 8 primitive counts.
  //
  if (lane_in_range)
    {
      fill_scan.dispatch[lane] = u32vec4(count, 1, 1, base);
    }

#else
#error "Unexpected subgroup size!"
#endif
}
//
//
//