blob: d6b4befe63d936dbf36ec9300a8241e1204c8b47 [file] [log] [blame]
// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "lib/escher/impl/ssdo_accelerator.h"
#include "lib/escher/escher.h"
#include "lib/escher/impl/command_buffer.h"
#include "lib/escher/impl/compute_shader.h"
#include "lib/escher/renderer/frame.h"
#include "lib/escher/resources/resource_recycler.h"
#include "lib/escher/util/trace_macros.h"
#include "lib/escher/vk/buffer.h"
#include "lib/escher/vk/image.h"
#include "lib/escher/vk/image_factory.h"
#include "lib/escher/vk/texture.h"
namespace escher {
namespace impl {
namespace {
// GLSL compute kernel that builds the packed SSDO-acceleration lookup table.
// Each invocation reads an 8x8 neighborhood of 'depthImage' (binding 0), plus
// a kRadius-wide margin on every side, records in shared memory which cells
// are higher than each of their 4-way neighbors, "smears" those bits outward
// by up to kRadius so occluders influence a wider area, and finally packs the
// result into a 2x2 block of 'resultImage' texels (binding 1, rgba8) -- two
// bits per depth pixel, eight bits per output channel.
// NOTE(review): the kernel is a string literal, so its contents (including
// its own GLSL comments) are intentionally left byte-for-byte unchanged.
constexpr char g_kernel_src[] = R"GLSL(
#version 450
#extension GL_ARB_separate_shader_objects : enable
layout (binding = 0) uniform sampler2D depthImage;
layout (binding = 1, rgba8) uniform image2D resultImage;
// Must be <= 4, otherwise margins will be too small. See below.
const int kRadius = 4;
// The area of interest is 8-wide so that we can pack the bits into a channel
// of an 8-bit image. It is 8-high just because.
const int kEight = 8;
// Holds the 4-way neighbor relationships for the area of interest, plus a
// 8-wide kRadius-high area above and below.
//
// A 1-bit in the
// R channel means that the cell is higher than its up-neighbor.
// G channel means that the cell is higher than its down-neighbor.
// B channel means that the cell is higher than its left-neighbor.
// A channel means that the cell is higher than its right-neighbor.
const int kNeighborhoodHeight = kEight + 2 * kRadius;
const int kNeighborhoodWidth = kEight + 2 * kRadius;
// NOTE: we put the two variables below in shared memory even though we have a
// single thread per workgroup, in order to avoid exceeding the available
// number of registers. On a particular NVIDIA GPU, making 'roi' shared is
// enough, and making 'depths' shared actually reduces performance. However, on
// the Acer Switch 12 Alpha, both are necessary to avoid VK_ERROR_DEVICE_LOST.
// The 'region of interest' that stores the intermediate data structure.
shared ivec4 roi[kNeighborhoodHeight];
void computeNeighborRelationships() {
float depths0[kNeighborhoodWidth];
float depths1[kNeighborhoodWidth];
ivec2 depth_base =
ivec2(gl_GlobalInvocationID.xy) * kEight - ivec2(kRadius, kRadius);
for (uint x = 0; x < kNeighborhoodWidth; ++x) {
// Load into depths1. It will be copied into depths0 before using
// (wasteful, but makes the code cleaner), hence the 'y = 0' below.
depths1[x] = texture(depthImage, depth_base + ivec2(x, 0)).r;
}
// Bottom row doesn't cast shadows downward, and top row doesn't cast
// shadows upward.
roi[kNeighborhoodHeight - 1].g = 0;
roi[0].r = 0;
// Compute leftward/rightward shadow-casting for top row.
{
int casts_rightward = 0;
int casts_leftward = 0;
/* What follows is a vectorized version of this code:
for (uint x = 1; x < kNeighborhoodWidth; ++x) {
float diff = depths1[(x - 1)] - depths1[x];
casts_rightward += (diff < 0.0) ? (1 << x) : 0;
casts_leftward += (diff > 0.0) ? (2 << x) : 0;
}
*/
vec4 diff1 = vec4(depths1[0], depths1[1], depths1[2], depths1[3]);
vec4 diff2 = vec4(depths1[1], depths1[2], depths1[3], depths1[4]);
diff1 -= diff2;
casts_rightward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
lessThan(diff1, vec4(0, 0, 0, 0)))));
casts_leftward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
greaterThan(diff1, vec4(0, 0, 0, 0))))) << 1;
diff1 = vec4(depths1[4], depths1[5], depths1[6], depths1[7]);
diff2 = vec4(depths1[5], depths1[6], depths1[7], depths1[8]);
diff1 -= diff2;
casts_rightward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
lessThan(diff1, vec4(0, 0, 0, 0))))) << 4;
casts_leftward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
greaterThan(diff1, vec4(0, 0, 0, 0))))) << 5;
diff1 = vec4(depths1[8], depths1[9], depths1[10], depths1[11]);
diff2 = vec4(depths1[9], depths1[10], depths1[11], depths1[12]);
diff1 -= diff2;
casts_rightward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
lessThan(diff1, vec4(0, 0, 0, 0))))) << 8;
casts_leftward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
greaterThan(diff1, vec4(0, 0, 0, 0))))) << 9;
diff1 = vec4(depths1[12], depths1[13], depths1[14], depths1[15]);
// Note that the last value is repeated, to not go out-of-bounds.
diff2 = vec4(depths1[13], depths1[14], depths1[15], depths1[15]);
diff1 -= diff2;
casts_rightward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
lessThan(diff1, vec4(0, 0, 0, 0))))) << 12;
casts_leftward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
greaterThan(diff1, vec4(0, 0, 0, 0))))) << 13;
roi[0].b = casts_leftward;
roi[0].a = casts_rightward;
}
for (uint y = 1; y < kNeighborhoodHeight; ++y) {
// Update depth values for 2-row subneighborhood, and compute upward/
// downward shadow-casting.
{
int casts_upward = 0;
int casts_downward = 0;
/* What follows is a vectorized version of this code:
for (uint x = 0; x < kNeighborhoodWidth; ++x) {
depths0[x] = depths1[x];
depths1[x] = texture(depthImage, depth_base + ivec2(x, y)).r;
float diff = depths1[x] - depths0[x];
casts_downward += (diff > 0.0) ? (1 << x) : 0;
casts_upward += (diff < 0.0) ? (1 << x) : 0;
}
*/
for (uint x = 0; x < kNeighborhoodWidth; x += 4) {
depths0[x] = depths1[x];
depths0[x + 1] = depths1[x + 1];
depths0[x + 2] = depths1[x + 2];
depths0[x + 3] = depths1[x + 3];
depths1[x] = texture(depthImage, depth_base + ivec2(x, y)).r;
depths1[x + 1] = texture(depthImage, depth_base + ivec2(x + 1, y)).r;
depths1[x + 2] = texture(depthImage, depth_base + ivec2(x + 2, y)).r;
depths1[x + 3] = texture(depthImage, depth_base + ivec2(x + 3, y)).r;
vec4 diff = vec4(depths1[x], depths1[x + 1], depths1[x + 2], depths1[x + 3]) -
vec4(depths0[x], depths0[x + 1], depths0[x + 2], depths0[x + 3]);
casts_downward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
greaterThan(diff, vec4(0, 0, 0, 0))))) << x;
casts_upward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
lessThan(diff, vec4(0, 0, 0, 0))))) << x;
}
roi[y - 1].g = casts_downward;
roi[y].r = casts_upward;
}
// Compute leftward/rightward shadow casting for current row.
{
int casts_rightward = 0;
int casts_leftward = 0;
/* What follows is a vectorized version of this code:
for (uint x = 1; x < kNeighborhoodWidth; ++x) {
float diff = depths1[(x - 1)] - depths1[x];
casts_rightward += (diff < 0.0) ? (1 << x) : 0;
casts_leftward += (diff > 0.0) ? (2 << x) : 0;
}
*/
vec4 diff1 = vec4(depths1[0], depths1[1], depths1[2], depths1[3]);
vec4 diff2 = vec4(depths1[1], depths1[2], depths1[3], depths1[4]);
diff1 -= diff2;
casts_rightward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
lessThan(diff1, vec4(0, 0, 0, 0)))));
casts_leftward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
greaterThan(diff1, vec4(0, 0, 0, 0))))) << 1;
diff1 = vec4(depths1[4], depths1[5], depths1[6], depths1[7]);
diff2 = vec4(depths1[5], depths1[6], depths1[7], depths1[8]);
diff1 -= diff2;
casts_rightward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
lessThan(diff1, vec4(0, 0, 0, 0))))) << 4;
casts_leftward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
greaterThan(diff1, vec4(0, 0, 0, 0))))) << 5;
diff1 = vec4(depths1[8], depths1[9], depths1[10], depths1[11]);
diff2 = vec4(diff1.gba, depths1[12]);
diff1 -= diff2;
casts_rightward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
lessThan(diff1, vec4(0, 0, 0, 0))))) << 8;
casts_leftward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
greaterThan(diff1, vec4(0, 0, 0, 0))))) << 9;
diff1 = vec4(depths1[12], depths1[13], depths1[14], depths1[15]);
// Note that the last value is repeated, to not go out-of-bounds.
diff2 = vec4(depths1[13], depths1[14], depths1[15], depths1[15]);
diff1 -= diff2;
casts_rightward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
lessThan(diff1, vec4(0, 0, 0, 0))))) << 12;
casts_leftward += int(dot(ivec4(1, 2, 4, 8),
mix(ivec4(0, 0, 0, 0),
ivec4(1, 1, 1, 1),
greaterThan(diff1, vec4(0, 0, 0, 0))))) << 13;
roi[y].b = casts_leftward;
roi[y].a = casts_rightward;
}
}
}
void smearNeighborRelationships() {
// Smear 'downward' to cast shadows even further 'downward'.
// Count downward so that we don't smear already-smeared values.
for (int y = kEight - 1; y >= 0; --y) {
ivec4 smeared = roi[kRadius + y];
for (uint rad = 1; rad < kRadius; ++rad) {
smeared.gba |= roi[kRadius + y - rad].gba;
}
roi[kRadius + y] = smeared;
}
// Smear 'upward' to cast shadows even further 'upward'.
for (uint y = 0; y < kEight; ++y) {
ivec3 smeared = roi[kRadius + y].rba;
for (uint rad = 1; rad < kRadius; ++rad) {
smeared |= roi[kRadius + y + rad].rba;
}
roi[kRadius + y].rba = smeared;
}
// Smear 'rightward' to cast shadows even further 'rightward', and similarly
// for leftward.
for (uint y = 0; y < kEight; ++y) {
ivec4 smeared = roi[kRadius + y];
for (uint rad = 1; rad < kRadius; ++rad) {
// Smear 'upward' and 'downward' bits to left and right.
// Smear 'leftward' bits to left only, to avoid false positives.
// Smear 'rightward' bits to right only, to avoid false positives.
smeared.rgb |= (smeared.rgb >> 1);
smeared.rga |= (smeared.rga << 1);
}
roi[kRadius + y] = smeared;
}
}
void main() {
computeNeighborRelationships();
smearNeighborRelationships();
ivec2 base = ivec2(gl_GlobalInvocationID.xy) * 2;
for (int y = 0; y < 2; ++y) {
ivec4 up_down_row = ivec4(
(roi[y * 4 + kRadius].r | roi[y * 4 + kRadius].g) >> kRadius,
(roi[y * 4 + kRadius + 1].r | roi[y * 4 + kRadius + 1].g) >> kRadius,
(roi[y * 4 + kRadius + 2].r | roi[y * 4 + kRadius + 2].g) >> kRadius,
(roi[y * 4 + kRadius + 3].r | roi[y * 4 + kRadius + 3].g) >> kRadius);
ivec4 left_right_row = ivec4(
(roi[y * 4 + kRadius].b | roi[y * 4 + kRadius].a) >> kRadius,
(roi[y * 4 + kRadius + 1].b | roi[y * 4 + kRadius + 1].a) >> kRadius,
(roi[y * 4 + kRadius + 2].b | roi[y * 4 + kRadius + 2].a) >> kRadius,
(roi[y * 4 + kRadius + 3].b | roi[y * 4 + kRadius + 3].a) >> kRadius);
ivec4 left_row = ivec4(0, 0, 0, 0);
ivec4 right_row = ivec4(0, 0, 0, 0);
for (int xx = 0; xx < 4; ++xx) {
for (int yy = 0; yy < 4; ++yy) {
left_row[yy] += (up_down_row[yy] & (1 << xx)) > 0 ? (1 << (xx * 2)) : 0;
left_row[yy] += (left_right_row[yy] & (1 << xx)) > 0 ? (1 << (xx * 2 + 1)) : 0;
right_row[yy] += (up_down_row[yy] & (1 << (xx + 4))) > 0 ? (1 << (xx * 2)) : 0;
right_row[yy] += (left_right_row[yy] & (1 << (xx + 4))) > 0 ? (1 << (xx * 2 + 1)) : 0;
}
}
imageStore(resultImage, base + ivec2(0, y), vec4(left_row) / 255.0);
imageStore(resultImage, base + ivec2(1, y), vec4(right_row) / 255.0);
}
}
)GLSL";
// GLSL compute kernel used when the accelerator is disabled: it ignores
// 'depthImage' (bound only so the pipeline layout matches the real kernel)
// and writes 1.0 to every channel of each 2x2 output block -- i.e. all
// relationship bits set.  NOTE(review): presumably an all-ones table makes
// the downstream SSDO sampler take no shortcuts; confirm against the
// consumer of this lookup table.
constexpr char g_null_kernel_src[] = R"GLSL(
#version 450
#extension GL_ARB_separate_shader_objects : enable
layout (binding = 0) uniform sampler2D depthImage;
layout (binding = 1, rgba8) uniform image2D resultImage;
void main() {
ivec2 base = ivec2(gl_GlobalInvocationID.xy) * 2;
for (int y = 0; y < 2; ++y) {
imageStore(resultImage, base + ivec2(0, y), vec4(1.0, 1.0, 1.0, 1.0));
imageStore(resultImage, base + ivec2(1, y), vec4(1.0, 1.0, 1.0, 1.0));
}
}
)GLSL";
// GLSL compute kernel that expands the packed table ('tmpImage') into a
// full-resolution image for debug visualization.  Each packed texel channel
// holds 8 bits describing a 4x1 run of pixels (2 bits per pixel); bit 2*xx
// goes to the output texel's R channel and bit 2*xx+1 to G, with B=0, A=1.
constexpr char g_unpack_kernel_src[] = R"GLSL(
#version 450
#extension GL_ARB_separate_shader_objects : enable
layout (binding = 0, rgba8) uniform image2D tmpImage;
layout (binding = 1, rgba8) uniform image2D resultImage;
const int kSize = 8;
void main() {
uint x = gl_GlobalInvocationID.x * kSize;
uint y = gl_GlobalInvocationID.y * kSize;
for (uint i = 0; i < kSize / 4; ++i) {
for (uint j = 0; j < kSize / 4; ++j) {
// TODO: can we directly load ivec4?
ivec4 block = ivec4(imageLoad(tmpImage, ivec2(x / 4 + i, y / 4 + j)) * 255.f);
for (uint xx = 0; xx < 4; ++xx) {
for (uint yy = 0; yy < 4; ++yy) {
imageStore(resultImage,
ivec2(x + i * 4 + xx, y + j * 4 + yy),
vec4(
((block[yy] >> (2 * xx)) & 1),
((block[yy] >> (2 * xx + 1)) & 1),
0.f, 1.f));
}
}
}
}
}
)GLSL";
} // namespace
// Constructs the accelerator.  'escher' is moved into the member weak
// pointer.  'image_factory' is stored as a raw (non-owning) pointer and is
// used later to allocate lookup-table images, so it must outlive this object.
SsdoAccelerator::SsdoAccelerator(EscherWeakPtr escher,
                                 ImageFactory* image_factory)
    : escher_(std::move(escher)), image_factory_(image_factory) {}
SsdoAccelerator::~SsdoAccelerator() {}
// Convenience accessor: returns the Vulkan context held by the Escher
// instance.  The returned reference is owned by Escher, not by this object.
const VulkanContext& SsdoAccelerator::vulkan_context() const {
  return escher_->vulkan_context();
}
// Generates the packed SSDO-acceleration lookup table for 'depth_texture' by
// dispatching the packing kernel on 'frame''s command buffer.
//
// Returns an rgba8 texture at roughly 1/4 the depth texture's resolution in
// each dimension; each texel channel packs bits for a 4x1 run of depth
// pixels (2 bits per pixel).  When the accelerator is disabled, delegates to
// GenerateNullLookupTable() so callers always get a usable table.
//
// 'image_flags' is OR-ed with eStorage when allocating the output image, so
// callers may request additional usages (e.g. sampling or transfer).
TexturePtr SsdoAccelerator::GenerateLookupTable(
    const FramePtr& frame, const TexturePtr& depth_texture,
    vk::ImageUsageFlags image_flags) {
  if (!enabled_) {
    return GenerateNullLookupTable(frame, depth_texture, image_flags);
  }
  TRACE_DURATION("gfx", "escher::SsdoAccelerator::GenerateLookupTable");

  uint32_t width = depth_texture->width();
  uint32_t height = depth_texture->height();

  auto command_buffer = frame->command_buffer();

  // Size of neighborhood of pixels to work on for each invocation of the
  // compute kernel. Must match the value in the compute shader source code,
  // and be a multiple of 4.
  constexpr uint32_t kSize = 8;

  // One work group per (possibly partial) kSize x kSize neighborhood.
  uint32_t work_groups_x = width / kSize + (width % kSize > 0 ? 1 : 0);
  uint32_t work_groups_y = height / kSize + (height % kSize > 0 ? 1 : 0);

  // NOTE(review): the round-up term tests the remainder mod kSize (8) even
  // though the division is by 4.  For some sizes (e.g. width == 9) this
  // allocates one fewer packed column than the kernel's 2x2 output blocks
  // can address -- confirm intended behavior before changing the formula.
  uint32_t packed_width = width / 4 + (width % kSize > 0 ? 1 : 0);
  uint32_t packed_height = height / 4 + (height % kSize > 0 ? 1 : 0);

  ImagePtr tmp_image = image_factory_->NewImage(
      {vk::Format::eR8G8B8A8Unorm, packed_width, packed_height, 1,
       image_flags | vk::ImageUsageFlagBits::eStorage});
  TexturePtr tmp_texture = fxl::MakeRefCounted<Texture>(
      escher_->resource_recycler(), tmp_image, vk::Filter::eNearest,
      vk::ImageAspectFlagBits::eColor, true);

  // The kernel writes via imageStore, which requires eGeneral layout.
  command_buffer->TransitionImageLayout(tmp_image, vk::ImageLayout::eUndefined,
                                        vk::ImageLayout::eGeneral);

  if (!kernel_) {
    // FXL_DLOG(INFO) for consistency with the lazy instantiation of
    // null_kernel_ and unpack_kernel_ below (was FXL_VLOG(1)).
    FXL_DLOG(INFO) << "Lazily instantiating kernel_";
    kernel_ = std::make_unique<ComputeShader>(
        escher_,
        std::vector<vk::ImageLayout>{vk::ImageLayout::eShaderReadOnlyOptimal,
                                     vk::ImageLayout::eGeneral},
        std::vector<vk::DescriptorType>{}, 0, g_kernel_src);
  }
  kernel_->Dispatch({depth_texture, tmp_texture}, {}, command_buffer,
                    work_groups_x, work_groups_y, 1, nullptr);

  frame->AddTimestamp("generated SSDO acceleration lookup table");
  return tmp_texture;
}
// Produces a lookup table whose every channel is 1.0 (all relationship bits
// set), sized identically to the table GenerateLookupTable() would produce
// for the same depth texture.  Used when the accelerator is disabled.
TexturePtr SsdoAccelerator::GenerateNullLookupTable(
    const FramePtr& frame, const TexturePtr& depth_texture,
    vk::ImageUsageFlags image_flags) {
  // Each compute invocation covers one 8x8 pixel neighborhood.  This value
  // must stay in sync with the shader source, and must be a multiple of 4.
  constexpr uint32_t kNeighborhoodSize = 8;

  const uint32_t depth_width = depth_texture->width();
  const uint32_t depth_height = depth_texture->height();
  auto command_buffer = frame->command_buffer();

  // One work group per (possibly partial) neighborhood.
  const uint32_t group_count_x = depth_width % kNeighborhoodSize > 0
                                     ? depth_width / kNeighborhoodSize + 1
                                     : depth_width / kNeighborhoodSize;
  const uint32_t group_count_y = depth_height % kNeighborhoodSize > 0
                                     ? depth_height / kNeighborhoodSize + 1
                                     : depth_height / kNeighborhoodSize;

  // The output is packed at quarter resolution, rounded up per neighborhood.
  const uint32_t packed_width =
      depth_width / 4 + (depth_width % kNeighborhoodSize > 0 ? 1 : 0);
  const uint32_t packed_height =
      depth_height / 4 + (depth_height % kNeighborhoodSize > 0 ? 1 : 0);

  ImagePtr output_image = image_factory_->NewImage(
      {vk::Format::eR8G8B8A8Unorm, packed_width, packed_height, 1,
       image_flags | vk::ImageUsageFlagBits::eStorage});
  TexturePtr output_texture = fxl::MakeRefCounted<Texture>(
      escher_->resource_recycler(), output_image, vk::Filter::eNearest,
      vk::ImageAspectFlagBits::eColor, true);

  // imageStore in the kernel requires the eGeneral layout.
  command_buffer->TransitionImageLayout(
      output_image, vk::ImageLayout::eUndefined, vk::ImageLayout::eGeneral);

  if (!null_kernel_) {
    FXL_DLOG(INFO) << "Lazily instantiating null_kernel_";
    null_kernel_ = std::make_unique<ComputeShader>(
        escher_,
        std::vector<vk::ImageLayout>{vk::ImageLayout::eShaderReadOnlyOptimal,
                                     vk::ImageLayout::eGeneral},
        std::vector<vk::DescriptorType>{}, 0, g_null_kernel_src);
  }
  null_kernel_->Dispatch({depth_texture, output_texture}, {}, command_buffer,
                         group_count_x, group_count_y, 1, nullptr);

  frame->AddTimestamp("generated null SSDO acceleration lookup table");
  return output_texture;
}
// Expands a packed acceleration table into a full-resolution rgba8 texture
// for debug visualization.  'width'/'height' are the unpacked dimensions;
// the DCHECKs below verify they are consistent with the packed texture's
// size (each packed texel covers a 4x4 block of output pixels).
TexturePtr SsdoAccelerator::UnpackLookupTable(
    const FramePtr& frame, const TexturePtr& packed_lookup_table,
    uint32_t width, uint32_t height) {
  // Neighborhood size handled by each compute invocation; must match kSize
  // in the unpack shader source.
  constexpr uint32_t kNeighborhoodSize = 8;

  FXL_DCHECK(width <= packed_lookup_table->width() * 4);
  FXL_DCHECK(height <= packed_lookup_table->height() * 4);
  FXL_DCHECK(width + kNeighborhoodSize > packed_lookup_table->width() * 4);
  FXL_DCHECK(height + kNeighborhoodSize > packed_lookup_table->height() * 4);

  auto command_buffer = frame->command_buffer();

  ImagePtr unpacked_image =
      image_factory_->NewImage({vk::Format::eR8G8B8A8Unorm, width, height, 1,
                                vk::ImageUsageFlagBits::eStorage |
                                    vk::ImageUsageFlagBits::eTransferSrc});
  // imageStore in the kernel requires the eGeneral layout.
  command_buffer->TransitionImageLayout(
      unpacked_image, vk::ImageLayout::eUndefined, vk::ImageLayout::eGeneral);
  TexturePtr unpacked_texture = fxl::MakeRefCounted<Texture>(
      escher_->resource_recycler(), unpacked_image, vk::Filter::eNearest,
      vk::ImageAspectFlagBits::eColor);

  // One work group per (possibly partial) neighborhood of output pixels.
  const uint32_t group_count_x =
      width / kNeighborhoodSize + (width % kNeighborhoodSize > 0 ? 1 : 0);
  const uint32_t group_count_y =
      height / kNeighborhoodSize + (height % kNeighborhoodSize > 0 ? 1 : 0);

  if (!unpack_kernel_) {
    FXL_DLOG(INFO) << "Lazily instantiating unpack_kernel_";
    unpack_kernel_ = std::make_unique<ComputeShader>(
        escher_,
        std::vector<vk::ImageLayout>{vk::ImageLayout::eGeneral,
                                     vk::ImageLayout::eGeneral},
        std::vector<vk::DescriptorType>{}, 0, g_unpack_kernel_src);
  }
  unpack_kernel_->Dispatch({packed_lookup_table, unpacked_texture}, {},
                           command_buffer, group_count_x, group_count_y, 1,
                           nullptr);

  frame->AddTimestamp(
      "finished unpacking SSDO acceleration table for debug visualization");
  return unpacked_texture;
}
} // namespace impl
} // namespace escher