| #version 450 |
| |
| #extension GL_EXT_buffer_reference : require |
| #extension GL_EXT_buffer_reference_uvec2 : require |
| |
// Work group shape: 64 invocations along X. This equals the number of tiles
// in one superblock (8 * 8), so the per-stage loops in processBlock() hand
// exactly one tile to each invocation.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
| |
// Parameters supplied by the host through push constants. Buffer addresses
// are passed as uvec2 and converted to buffer references in the shader
// (GL_EXT_buffer_reference_uvec2).
layout(push_constant) uniform _push {
  // Address of the image: tile headers and tile bodies, addressed in
  // 16-byte (uvec4) units. Read and rewritten in place.
  uvec2 image_address;
  // Address of the staging buffer. Each tile of the current superblock gets
  // a slot of kAfbcTileUvec4s uvec4s, indexed by the tile's position within
  // the superblock.
  uvec2 scratch_address;
  // Address of the buffer that receives the final packed size in bytes.
  uvec2 aux_address;
  // Byte offset at which packed tile bodies start. It is divided by 16
  // before use, so it is presumably expected to be 16-byte aligned (confirm
  // against the host code).
  uint body_offset;
  // Number of superblocks to process.
  uint block_count;
};
| |
// Buffer reference type used for both the image and the scratch buffer: an
// unsized array of 16-byte elements. All offsets used with `data` are
// therefore expressed in uvec4 units rather than bytes.
layout(std430, buffer_reference, buffer_reference_align = 16) buffer image_buffer {
  uvec4 data[];
};
| |
// Buffer reference type for the auxiliary output: a single uint that main()
// fills with the end of the packed data, expressed in bytes.
layout(std430, buffer_reference, buffer_reference_align = 4) buffer aux_buffer {
  uint size_bytes;
};
| |
// AFBC geometry.
//
// A tile covers 16x16 pixels. Four pixels are counted per uvec4 (16 bytes),
// i.e. 4 bytes per pixel, which makes a full tile body 64 uvec4s = 1024
// bytes.
const uint kAfbcTilePixelWidth = 16;
const uint kAfbcTilePixelHeight = 16;
const uint kAfbcTilePixels = kAfbcTilePixelWidth * kAfbcTilePixelHeight;
const uint kAfbcPixelsPerUvec4 = 4;
const uint kAfbcTileUvec4s = kAfbcTilePixels / kAfbcPixelsPerUvec4;

// A superblock is a group of 8x8 tiles; deduplication is performed among the
// 64 tiles of one superblock at a time.
const uint kAfbcSuperblockTileWidth = 8;
const uint kAfbcSuperblockTileHeight = 8;
const uint kAfbcSuperblockTileCount = kAfbcSuperblockTileWidth * kAfbcSuperblockTileHeight;

// Sentinel stored in `duplicates` for a tile that is not a duplicate of any
// other tile. Valid entries are tile indices within the superblock
// (0 .. kAfbcSuperblockTileCount - 1), so the tile count itself can never be
// mistaken for one.
const uint kInvalidDuplicate = kAfbcSuperblockTileCount;
| |
// Per-tile working state for the superblock that is currently being packed.
// Each array has one entry per tile, indexed by the tile's position within
// the superblock.
//
// Footprint of the four arrays:
//   4 arrays * (8 * 8) entries * sizeof(uint) = 1024 bytes
//

// Hash of the tile body; 0 for tiles without a body.
shared uint hashes[kAfbcSuperblockTileCount];
// Body size in uvec4s; 0 for tiles without a body.
shared uint body_sizes[kAfbcSuperblockTileCount];
// Index of a lower-numbered tile with an identical body, or
// kInvalidDuplicate when no such tile was found.
shared uint duplicates[kAfbcSuperblockTileCount];
// Start offset (in uvec4s) reserved for the packed body; 0 for duplicates
// and for tiles without a body.
shared uint tile_starts[kAfbcSuperblockTileCount];

// Next free offset (in uvec4s) for packed tile bodies in the image. Seeded
// from body_offset in main() and advanced with atomicAdd as tiles reserve
// space.
shared uint next_tile_start;
| |
// Translates a 6-bit subtile size field from a tile header into a byte
// count. The field value 1 stands for 64 bytes; every other value is the
// size itself.
uint subtileSize(uint subtile)
{
  if (subtile == 1)
  {
    return 64;
  }
  return subtile;
}
| |
// Stage 1 worker. Reads the header of tile `tile_base_idx + i`, derives the
// size of the tile body from the subtile sizes stored in the header, hashes
// the body, and publishes both values to shared memory.
//
//   tile_base_idx  index (in uvec4s) of the first tile header of the
//                  superblock being processed
//   i              position of the tile within the superblock; also the slot
//                  written in `hashes` and `body_sizes`
//
// Tiles whose start offset is zero are treated as solid (no body): their
// hash and body size are both stored as 0.
void calculateHashAndBodySize(uint tile_base_idx, uint i)
{
  restrict image_buffer image = image_buffer(image_address);

  // A header is one uvec4: .x holds the byte offset of the tile body and
  // .yzw hold sixteen 6-bit subtile sizes packed back to back.
  uvec4 header = image.data[tile_base_idx + i];

  // Convert the byte offset into an offset in uvec4s. Tile starts are
  // expected to be multiples of the tile size (1024 bytes), so dividing by
  // 16 loses nothing.
  uint body_begin = header.x / 16;

  uint digest = 0;
  uint body_uvec4s = 0;

  if (body_begin != 0)
  {
    // Treat .yzw as one 96-bit little-endian value and peel off the low six
    // bits sixteen times, shifting the whole value right by six bits after
    // each step. Fields that straddle two words are reassembled by the
    // shift.
    uvec3 size_bits = header.yzw;
    uint body_bytes = 0;
    for (uint subtile = 0; subtile < 16; subtile++)
    {
      body_bytes += subtileSize(size_bits.x & 0x3f);

      size_bits.x = (size_bits.x >> 6) | (size_bits.y << 26);
      size_bits.y = (size_bits.y >> 6) | (size_bits.z << 26);
      size_bits.z = size_bits.z >> 6;
    }

    // Round the byte count up to whole uvec4s.
    body_uvec4s = (body_bytes + 15) / 16;

    // Fold every 32-bit word of the body into the hash, in memory order.
    uint body_end = body_begin + body_uvec4s;
    for (uint offset = body_begin; offset < body_end; offset++)
    {
      uvec4 chunk = image.data[offset];
      for (int c = 0; c < 4; c++)
      {
        digest = (digest * 31) ^ chunk[c];
      }
    }
  }

  // Publish the results for the deduplication stage.
  hashes[i] = digest;
  body_sizes[i] = body_uvec4s;
}
| |
// Stage 2 worker. Decides whether tile `i` duplicates a lower-numbered tile
// of the same superblock. A tile that has a body and is not a duplicate is
// copied to its slot in the scratch buffer and is given a start offset in
// the packed output.
//
//   tile_base_idx  index (in uvec4s) of the first tile header of the
//                  superblock being processed
//   i              position of the tile within the superblock
//
// Reads `hashes` and `body_sizes` (all entries up to `i`), so it must only
// run once stage 1 has finished for the whole work group. Writes
// `duplicates[i]` and `tile_starts[i]`.
void dedupReserveSpaceAndCopyToScratch(uint tile_base_idx, uint i)
{
  restrict image_buffer image = image_buffer(image_address);

  // Results of stage 1 for this tile.
  uint my_hash = hashes[i];
  uint my_size = body_sizes[i];

  uint match = kInvalidDuplicate;
  uint reserved_start = 0;

  // Tiles without a body have nothing to deduplicate or copy.
  if (my_size != 0)
  {
    // Start of this tile's unpacked body, in uvec4s.
    uint my_start = image.data[tile_base_idx + i].x / 16;

    // Walk the lower-numbered tiles from i - 1 down to 0 and stop at the
    // first exact match. This linear scan can be optimized in a number of
    // ways, since failing to find an existing duplicate is not an error
    // (fxb/64177).
    for (uint remaining = i; remaining > 0 && match == kInvalidDuplicate; remaining--)
    {
      uint j = remaining - 1;

      // Cheap rejection: both hash and body size have to agree.
      if (hashes[j] != my_hash || body_sizes[j] != my_size)
      {
        continue;
      }

      // Start of the candidate's unpacked body, in uvec4s.
      uint other_start = image.data[tile_base_idx + j].x / 16;

      // Equal hashes do not prove equal bodies; compare the actual data and
      // give up on the first difference.
      bool identical = true;
      for (uint k = 0; identical && k < my_size; k++)
      {
        identical = (image.data[other_start + k] == image.data[my_start + k]);
      }

      if (identical)
      {
        match = j;
      }
    }

    if (match == kInvalidDuplicate)
    {
      // Unique tile: stash the body in this tile's scratch slot so that the
      // write-out stage can place it in the image without reading from
      // locations that other tiles may already have overwritten.
      restrict image_buffer scratch = image_buffer(scratch_address);
      uint scratch_base = i * kAfbcTileUvec4s;

      uint k = 0;
      while (k < my_size)
      {
        scratch.data[scratch_base + k] = image.data[my_start + k];
        k++;
      }

      // Reserve space in the packed output. Invocations reserve
      // concurrently, so this code does not fix the order of tiles within a
      // superblock in the packed output.
      reserved_start = atomicAdd(next_tile_start, my_size);
    }
  }

  // Publish the results for the write-out stage.
  duplicates[i] = match;
  tile_starts[i] = reserved_start;
}
| |
// Stage 3 worker. Moves the body of tile `i` from the scratch buffer to its
// packed location in the image (unless the tile is a duplicate) and rewrites
// the tile header so that it points at the packed body.
//
//   tile_base_idx  index (in uvec4s) of the first tile header of the
//                  superblock being processed
//   i              position of the tile within the superblock
//
// Reads `tile_starts`, `duplicates` and `body_sizes`, so it must only run
// once stage 2 has finished for the whole work group.
void resolveAndWriteOut(uint tile_base_idx, uint i)
{
  restrict image_buffer image = image_buffer(image_address);

  // Fetch the current header before anything is written to the image.
  uint header_idx = tile_base_idx + i;
  uvec4 header = image.data[header_idx];

  // Results of stage 2 for this tile.
  uint packed_start = tile_starts[i];
  uint source = duplicates[i];

  if (source == kInvalidDuplicate)
  {
    // Not a duplicate: copy the body from this tile's scratch slot to the
    // reserved location. Tiles without a body have size 0 and copy nothing.
    restrict image_buffer scratch = image_buffer(scratch_address);
    uint scratch_base = i * kAfbcTileUvec4s;
    uint body_uvec4s = body_sizes[i];

    uint j = 0;
    while (j < body_uvec4s)
    {
      image.data[packed_start + j] = scratch.data[scratch_base + j];
      j++;
    }
  }

  // Duplicate: the tile it matched may itself be a duplicate, so follow the
  // chain until a tile that owns a body is reached and reuse its start. The
  // loop is skipped entirely for tiles that are not duplicates.
  while (source != kInvalidDuplicate)
  {
    packed_start = tile_starts[source];
    source = duplicates[source];
  }

  // Store the new start as a byte offset; the rest of the header is kept.
  header.x = packed_start * 16;
  image.data[header_idx] = header;
}
| |
// Packs one superblock in three stages separated by work group barriers.
//
//   block  index of the superblock; its tile headers start at
//          block * kAfbcSuperblockTileCount in the image
//
// In every stage the tiles are distributed over the invocations of the work
// group in the same strided fashion, so a given tile is handled by the same
// invocation in all three stages. That invocation writes the tile's scratch
// slot in stage 2 and reads it back in stage 3.
void processBlock(uint block)
{
  uint first_tile = block * kAfbcSuperblockTileCount;
  uint lane = gl_LocalInvocationID.x;
  uint stride = gl_WorkGroupSize.x;
  uint tile;

  //
  // Stage 1: hash each tile body and determine its size.
  //
  tile = lane;
  while (tile < kAfbcSuperblockTileCount)
  {
    calculateHashAndBodySize(first_tile, tile);
    tile += stride;
  }

  // Stage 2 reads the hashes and sizes of other tiles, so every invocation
  // has to be done with stage 1 first.
  barrier();

  //
  // Stage 2: find duplicates, reserve space, and copy unique tiles to the
  // staging buffer.
  //
  tile = lane;
  while (tile < kAfbcSuperblockTileCount)
  {
    dedupReserveSpaceAndCopyToScratch(first_tile, tile);
    tile += stride;
  }

  // Stage 3 reads the duplicate links and start offsets of other tiles, so
  // every invocation has to be done with stage 2 first.
  barrier();

  //
  // Stage 3: resolve duplicates and write tiles from the staging buffer back
  // to the image.
  //
  tile = lane;
  while (tile < kAfbcSuperblockTileCount)
  {
    resolveAndWriteOut(first_tile, tile);
    tile += stride;
  }
}
| |
// Entry point. Packs `block_count` superblocks one after another and then
// reports the end of the packed data, in bytes, through the auxiliary
// buffer.
//
// NOTE(review): the running offset lives in shared memory and every work
// group walks all blocks, so this presumably has to be dispatched as a
// single work group; confirm against the host code.
void main()
{
  bool is_leader = (gl_LocalInvocationID.x == 0);

  // Seed the running output offset (in uvec4s) from the byte offset of the
  // body area. The first use of the offset is in stage 2 of processBlock(),
  // which runs after a barrier, so no extra barrier is needed here.
  if (is_leader)
  {
    next_tile_start = body_offset / 16;
  }

  // Blocks have to be processed in forward order: packed data of a block may
  // only be written over memory of blocks that have already been processed.
  //
  // No barrier is needed between blocks. The last stage of one block reads
  // `tile_starts` and `duplicates`, which the first stage of the next block
  // does not write, and each invocation only touches its own entries of
  // `body_sizes` in those two stages.
  uint block = 0;
  while (block < block_count)
  {
    processBlock(block);
    block++;
  }

  // Report where the packed data ends, converted back to bytes.
  if (is_leader)
  {
    restrict aux_buffer aux = aux_buffer(aux_address);
    aux.size_bytes = next_tile_start * 16;
  }
}