| /* |
| * Copyright © 2020 Raspberry Pi Ltd |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "v3dv_private.h" |
| |
| #include "util/timespec.h" |
| #include "compiler/nir/nir_builder.h" |
| |
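| /* Table of performance counters exposed through VK_KHR_performance_query. |
| * Each entry is { category, name, description }; the table index is the |
| * counter index used by the API and passed to the kernel. |
| */ |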
| static const char *v3dv_counters[][3] = { |
| {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, |
| {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, |
| {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"}, |
| {"FEP", "FEP-valid-quads", "[FEP] Valid quads"}, |
| {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"}, |
| {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"}, |
| {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"}, |
| {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"}, |
| {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"}, |
| {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"}, |
| {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"}, |
| {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"}, |
| {"PTB", "PTB-primitives-discared-reversed", "[PTB] Primitives that are discarded because they are reversed"}, |
| {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"}, |
| {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"}, |
| {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"}, |
| {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"}, |
| {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"}, |
| {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"}, |
| {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"}, |
| {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"}, |
| {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"}, |
| {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"}, |
| {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"}, |
| {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"}, |
| {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"}, |
| {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"}, |
| {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"}, |
| {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"}, |
| {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"}, |
| {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"}, |
| {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"}, |
| {"CORE", "cycle-count", "[CORE] Cycle counter"}, |
| {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"}, |
| {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"}, |
| {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"}, |
| {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"}, |
| {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"}, |
| {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"}, |
| {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"}, |
| {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"}, |
| {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"}, |
| {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"}, |
| {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"}, |
| {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"}, |
| {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"}, |
| {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"}, |
| {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"}, |
| {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"}, |
| {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"}, |
| {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"}, |
| {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"}, |
| {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"}, |
| {"TMU", "TMU-active-cycles", "[TMU] Active cycles"}, |
| {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"}, |
| {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"}, |
| {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"}, |
| {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"}, |
| {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"}, |
| {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"}, |
| {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"}, |
| {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"}, |
| {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"}, |
| {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"}, |
| {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"}, |
| {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"}, |
| {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"}, |
| {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"}, |
| {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"}, |
| {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"}, |
| {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"}, |
| {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, |
| {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, |
| {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, |
| {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, |
| {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, |
| {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, |
| {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, |
| {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, |
| {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, |
| {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"}, |
| {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, |
| {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, |
| {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, |
| {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, |
| {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"}, |
| {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, |
| }; |
| |
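| /* A single kernel perfmon can only sample up to DRM_V3D_MAX_PERF_COUNTERS |
| * counters, so a query that enables more than that is backed by as many |
| * kernel perfmons as needed to cover all of its counters. |
| */ |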
| static void |
| kperfmon_create(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| uint32_t query) |
| { |
| for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { |
| assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters); |
| |
| struct drm_v3d_perfmon_create req = { |
| .ncounters = MIN2(pool->perfmon.ncounters - |
| i * DRM_V3D_MAX_PERF_COUNTERS, |
| DRM_V3D_MAX_PERF_COUNTERS), |
| }; |
| memcpy(req.counters, |
| &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS], |
| req.ncounters); |
| |
| int ret = v3dv_ioctl(device->pdevice->render_fd, |
| DRM_IOCTL_V3D_PERFMON_CREATE, |
| &req); |
| if (ret) |
| fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret)); |
| |
| pool->queries[query].perf.kperfmon_ids[i] = req.id; |
| } |
| } |
| |
| static void |
| kperfmon_destroy(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| uint32_t query) |
| { |
| /* Skip destroying if never created */ |
| if (!pool->queries[query].perf.kperfmon_ids[0]) |
| return; |
| |
| for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { |
| struct drm_v3d_perfmon_destroy req = { |
| .id = pool->queries[query].perf.kperfmon_ids[i] |
| }; |
| |
| int ret = v3dv_ioctl(device->pdevice->render_fd, |
| DRM_IOCTL_V3D_PERFMON_DESTROY, |
| &req); |
| |
| if (ret) { |
| fprintf(stderr, "Failed to destroy perfmon %u: %s\n", |
| req.id, strerror(errno)); |
| } |
| } |
| } |
| |
| /** |
| * Creates a VkBuffer (and VkDeviceMemory) to access a BO. |
| */ |
| static VkResult |
| create_vk_storage_buffer(struct v3dv_device *device, |
| struct v3dv_bo *bo, |
| VkBuffer *vk_buf, |
| VkDeviceMemory *vk_mem) |
| { |
| VkDevice vk_device = v3dv_device_to_handle(device); |
| |
| VkBufferCreateInfo buf_info = { |
| .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, |
| .size = bo->size, |
| .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, |
| }; |
| VkResult result = v3dv_CreateBuffer(vk_device, &buf_info, NULL, vk_buf); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| struct v3dv_device_memory *mem = |
| vk_object_zalloc(&device->vk, NULL, sizeof(*mem), |
| VK_OBJECT_TYPE_DEVICE_MEMORY); |
| if (!mem) |
| return VK_ERROR_OUT_OF_HOST_MEMORY; |
| |
| mem->bo = bo; |
| mem->type = &device->pdevice->memory.memoryTypes[0]; |
| |
| *vk_mem = v3dv_device_memory_to_handle(mem); |
| VkBindBufferMemoryInfo bind_info = { |
| .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, |
| .buffer = *vk_buf, |
| .memory = *vk_mem, |
| .memoryOffset = 0, |
| }; |
| v3dv_BindBufferMemory2(vk_device, 1, &bind_info); |
| |
| return VK_SUCCESS; |
| } |
| |
| static void |
| destroy_vk_storage_buffer(struct v3dv_device *device, |
| VkBuffer *vk_buf, |
| VkDeviceMemory *vk_mem) |
| { |
| if (*vk_mem) { |
| vk_object_free(&device->vk, NULL, v3dv_device_memory_from_handle(*vk_mem)); |
| *vk_mem = VK_NULL_HANDLE; |
| } |
| |
| v3dv_DestroyBuffer(v3dv_device_to_handle(device), *vk_buf, NULL); |
| *vk_buf = VK_NULL_HANDLE; |
| } |
| |
| /** |
| * Allocates descriptor sets to access query pool BO (availability and |
| * occlusion query results) from Vulkan pipelines. |
| */ |
| static VkResult |
| create_pool_descriptors(struct v3dv_device *device, |
| struct v3dv_query_pool *pool) |
| { |
| assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION); |
| VkDevice vk_device = v3dv_device_to_handle(device); |
| |
| VkDescriptorPoolSize pool_size = { |
| .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| .descriptorCount = 1, |
| }; |
| VkDescriptorPoolCreateInfo pool_info = { |
| .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, |
| .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, |
| .maxSets = 1, |
| .poolSizeCount = 1, |
| .pPoolSizes = &pool_size, |
| }; |
| VkResult result = |
| v3dv_CreateDescriptorPool(vk_device, &pool_info, NULL, |
| &pool->meta.descriptor_pool); |
| |
| if (result != VK_SUCCESS) |
| return result; |
| |
| VkDescriptorSetAllocateInfo alloc_info = { |
| .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, |
| .descriptorPool = pool->meta.descriptor_pool, |
| .descriptorSetCount = 1, |
| .pSetLayouts = &device->queries.buf_descriptor_set_layout, |
| }; |
| result = v3dv_AllocateDescriptorSets(vk_device, &alloc_info, |
| &pool->meta.descriptor_set); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| VkDescriptorBufferInfo desc_buf_info = { |
| .buffer = pool->meta.buf, |
| .offset = 0, |
| .range = VK_WHOLE_SIZE, |
| }; |
| |
| VkWriteDescriptorSet write = { |
| .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, |
| .dstSet = pool->meta.descriptor_set, |
| .dstBinding = 0, |
| .dstArrayElement = 0, |
| .descriptorCount = 1, |
| .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| .pBufferInfo = &desc_buf_info, |
| }; |
| v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); |
| |
| return VK_SUCCESS; |
| } |
| |
| static void |
| destroy_pool_descriptors(struct v3dv_device *device, |
| struct v3dv_query_pool *pool) |
| { |
| assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION); |
| |
| v3dv_FreeDescriptorSets(v3dv_device_to_handle(device), |
| pool->meta.descriptor_pool, |
| 1, &pool->meta.descriptor_set); |
| pool->meta.descriptor_set = VK_NULL_HANDLE; |
| |
| v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device), |
| pool->meta.descriptor_pool, NULL); |
| pool->meta.descriptor_pool = VK_NULL_HANDLE; |
| } |
| |
| static VkResult |
| pool_create_meta_resources(struct v3dv_device *device, |
| struct v3dv_query_pool *pool) |
| { |
| VkResult result; |
| |
| if (pool->query_type != VK_QUERY_TYPE_OCCLUSION) |
| return VK_SUCCESS; |
| |
| result = create_vk_storage_buffer(device, pool->occlusion.bo, |
| &pool->meta.buf, &pool->meta.mem); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| result = create_pool_descriptors(device, pool); |
| if (result != VK_SUCCESS) |
| return result; |
| |
| return VK_SUCCESS; |
| } |
| |
| static void |
| pool_destroy_meta_resources(struct v3dv_device *device, |
| struct v3dv_query_pool *pool) |
| { |
| if (pool->query_type != VK_QUERY_TYPE_OCCLUSION) |
| return; |
| |
| destroy_pool_descriptors(device, pool); |
| destroy_vk_storage_buffer(device, &pool->meta.buf, &pool->meta.mem); |
| } |
| |
| VKAPI_ATTR VkResult VKAPI_CALL |
| v3dv_CreateQueryPool(VkDevice _device, |
| const VkQueryPoolCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *pAllocator, |
| VkQueryPool *pQueryPool) |
| { |
| V3DV_FROM_HANDLE(v3dv_device, device, _device); |
| |
| assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION || |
| pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP || |
| pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); |
| assert(pCreateInfo->queryCount > 0); |
| |
| struct v3dv_query_pool *pool = |
| vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool), |
| VK_OBJECT_TYPE_QUERY_POOL); |
| if (pool == NULL) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| pool->query_type = pCreateInfo->queryType; |
| pool->query_count = pCreateInfo->queryCount; |
| |
| uint32_t query_idx = 0; |
| VkResult result; |
| |
| const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count; |
| pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (pool->queries == NULL) { |
| result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| goto fail; |
| } |
| |
| switch (pool->query_type) { |
| case VK_QUERY_TYPE_OCCLUSION: { |
| /* The hardware allows us to set up groups of 16 queries in consecutive |
| * 4-byte addresses, requiring only that each group of 16 queries is |
| * aligned to a 1024 byte boundary. |
| */ |
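| /* For example, query 17 belongs to group 1, so its counter lives |
| * at offset 1 * 1024 + (17 % 16) * 4 = 1028 in the BO. |
| */ |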
| const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16); |
| uint32_t bo_size = query_groups * 1024; |
| /* After the counters we store availability data, 1 byte/query */ |
| pool->occlusion.avail_offset = bo_size; |
| bo_size += pool->query_count; |
| pool->occlusion.bo = v3dv_bo_alloc(device, bo_size, "query:o", true); |
| if (!pool->occlusion.bo) { |
| result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); |
| goto fail; |
| } |
| if (!v3dv_bo_map(device, pool->occlusion.bo, bo_size)) { |
| result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); |
| goto fail; |
| } |
| break; |
| } |
| case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { |
| const VkQueryPoolPerformanceCreateInfoKHR *pq_info = |
| vk_find_struct_const(pCreateInfo->pNext, |
| QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); |
| |
| assert(pq_info); |
| assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM); |
| |
| pool->perfmon.ncounters = pq_info->counterIndexCount; |
| for (uint32_t i = 0; i < pq_info->counterIndexCount; i++) |
| pool->perfmon.counters[i] = pq_info->pCounterIndices[i]; |
| |
| pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters, |
| DRM_V3D_MAX_PERF_COUNTERS); |
| |
| assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS); |
| break; |
| } |
| case VK_QUERY_TYPE_TIMESTAMP: |
| break; |
| default: |
| unreachable("Unsupported query type"); |
| } |
| |
| /* Initialize queries in the pool */ |
| for (; query_idx < pool->query_count; query_idx++) { |
| pool->queries[query_idx].maybe_available = false; |
| switch (pool->query_type) { |
| case VK_QUERY_TYPE_OCCLUSION: { |
| const uint32_t query_group = query_idx / 16; |
| const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4; |
| pool->queries[query_idx].occlusion.offset = query_offset; |
| break; |
| } |
| case VK_QUERY_TYPE_TIMESTAMP: |
| pool->queries[query_idx].value = 0; |
| break; |
| case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { |
| result = vk_sync_create(&device->vk, |
| &device->pdevice->drm_syncobj_type, 0, 0, |
| &pool->queries[query_idx].perf.last_job_sync); |
| if (result != VK_SUCCESS) |
| goto fail; |
| |
| for (uint32_t j = 0; j < pool->perfmon.nperfmons; j++) |
| pool->queries[query_idx].perf.kperfmon_ids[j] = 0; |
| break; |
| } |
| default: |
| unreachable("Unsupported query type"); |
| } |
| } |
| |
| /* Create meta resources */ |
| result = pool_create_meta_resources(device, pool); |
| if (result != VK_SUCCESS) |
| goto fail; |
| |
| *pQueryPool = v3dv_query_pool_to_handle(pool); |
| |
| return VK_SUCCESS; |
| |
| fail: |
| if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { |
| for (uint32_t j = 0; j < query_idx; j++) |
| vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync); |
| } |
| |
| if (pool->occlusion.bo) |
| v3dv_bo_free(device, pool->occlusion.bo); |
| if (pool->queries) |
| vk_free2(&device->vk.alloc, pAllocator, pool->queries); |
| pool_destroy_meta_resources(device, pool); |
| vk_object_free(&device->vk, pAllocator, pool); |
| |
| return result; |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| v3dv_DestroyQueryPool(VkDevice _device, |
| VkQueryPool queryPool, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| V3DV_FROM_HANDLE(v3dv_device, device, _device); |
| V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); |
| |
| if (!pool) |
| return; |
| |
| if (pool->occlusion.bo) |
| v3dv_bo_free(device, pool->occlusion.bo); |
| |
| if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { |
| for (uint32_t i = 0; i < pool->query_count; i++) { |
| kperfmon_destroy(device, pool, i); |
| vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync); |
| } |
| } |
| |
| if (pool->queries) |
| vk_free2(&device->vk.alloc, pAllocator, pool->queries); |
| |
| pool_destroy_meta_resources(device, pool); |
| |
| vk_object_free(&device->vk, pAllocator, pool); |
| } |
| |
| static void |
| write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value) |
| { |
| if (do_64bit) { |
| uint64_t *dst64 = (uint64_t *) dst; |
| dst64[idx] = value; |
| } else { |
| uint32_t *dst32 = (uint32_t *) dst; |
| dst32[idx] = (uint32_t) value; |
| } |
| } |
| |
| static VkResult |
| query_wait_available(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| struct v3dv_query *q, |
| uint32_t query_idx) |
| { |
| /* For occlusion queries we prefer to poll the availability BO in a loop |
| * rather than wait on the occlusion query results BO, because the latter |
| * would make us wait for any job running occlusion queries, even if those |
| * jobs do not involve the query we want to wait on. |
| */ |
| if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { |
| uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) + |
| pool->occlusion.avail_offset + query_idx; |
| while (*q_addr == 0) |
| usleep(250); |
| return VK_SUCCESS; |
| } |
| |
| /* For other queries we need to wait for the queue to signal that |
| * the query has been submitted for execution before anything else. |
| */ |
| assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || |
| pool->query_type == VK_QUERY_TYPE_TIMESTAMP); |
| |
| VkResult result = VK_SUCCESS; |
| if (!q->maybe_available) { |
| struct timespec timeout; |
| timespec_get(&timeout, TIME_UTC); |
| timespec_add_msec(&timeout, &timeout, 2000); |
| |
| mtx_lock(&device->query_mutex); |
| while (!q->maybe_available) { |
| if (vk_device_is_lost(&device->vk)) { |
| result = VK_ERROR_DEVICE_LOST; |
| break; |
| } |
| |
| int ret = cnd_timedwait(&device->query_ended, |
| &device->query_mutex, |
| &timeout); |
| if (ret != thrd_success) { |
| mtx_unlock(&device->query_mutex); |
| result = vk_device_set_lost(&device->vk, "Query wait failed"); |
| break; |
| } |
| } |
| mtx_unlock(&device->query_mutex); |
| |
| if (result != VK_SUCCESS) |
| return result; |
| |
| /* For performance queries, we also need to wait for the relevant syncobj |
| * to be signaled to ensure completion of the GPU work. |
| */ |
| if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR && |
| vk_sync_wait(&device->vk, q->perf.last_job_sync, |
| 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) { |
| return vk_device_set_lost(&device->vk, "Query job wait failed"); |
| } |
| } |
| |
| return result; |
| } |
| |
| static VkResult |
| query_check_available(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| struct v3dv_query *q, |
| uint32_t query_idx) |
| { |
| /* For occlusion queries we check the availability BO */ |
| if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { |
| const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) + |
| pool->occlusion.avail_offset + query_idx; |
| return (*q_addr != 0) ? VK_SUCCESS : VK_NOT_READY; |
| } |
| |
| /* For other queries we need to check if the queue has submitted the query |
| * for execution at all. |
| */ |
| assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || |
| pool->query_type == VK_QUERY_TYPE_TIMESTAMP); |
| if (!q->maybe_available) |
| return VK_NOT_READY; |
| |
| /* For performance queries, we also need to check if the relevant GPU job |
| * has completed. |
| */ |
| if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR && |
| vk_sync_wait(&device->vk, q->perf.last_job_sync, |
| 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) { |
| return VK_NOT_READY; |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| query_is_available(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| uint32_t query, |
| bool do_wait, |
| bool *available) |
| { |
| struct v3dv_query *q = &pool->queries[query]; |
| |
| assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION || |
| (pool->occlusion.bo && pool->occlusion.bo->map)); |
| |
| if (do_wait) { |
| VkResult result = query_wait_available(device, pool, q, query); |
| if (result != VK_SUCCESS) { |
| *available = false; |
| return result; |
| } |
| |
| *available = true; |
| } else { |
| VkResult result = query_check_available(device, pool, q, query); |
| assert(result == VK_SUCCESS || result == VK_NOT_READY); |
| *available = (result == VK_SUCCESS); |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| write_occlusion_query_result(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| uint32_t query, |
| bool do_64bit, |
| void *data, |
| uint32_t slot) |
| { |
| assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION); |
| |
| if (vk_device_is_lost(&device->vk)) |
| return VK_ERROR_DEVICE_LOST; |
| |
| struct v3dv_query *q = &pool->queries[query]; |
| assert(pool->occlusion.bo && pool->occlusion.bo->map); |
| |
| const uint8_t *query_addr = |
| ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset; |
| write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr)); |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| write_timestamp_query_result(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| uint32_t query, |
| bool do_64bit, |
| void *data, |
| uint32_t slot) |
| { |
| assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP); |
| |
| struct v3dv_query *q = &pool->queries[query]; |
| |
| write_to_buffer(data, slot, do_64bit, q->value); |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| write_performance_query_result(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| uint32_t query, |
| bool do_64bit, |
| void *data, |
| uint32_t slot) |
| { |
| assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); |
| |
| struct v3dv_query *q = &pool->queries[query]; |
| uint64_t counter_values[V3D_PERFCNT_NUM]; |
| |
| for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { |
| struct drm_v3d_perfmon_get_values req = { |
| .id = q->perf.kperfmon_ids[i], |
| .values_ptr = (uintptr_t)(&counter_values[i * |
| DRM_V3D_MAX_PERF_COUNTERS]) |
| }; |
| |
| int ret = v3dv_ioctl(device->pdevice->render_fd, |
| DRM_IOCTL_V3D_PERFMON_GET_VALUES, |
| &req); |
| |
| if (ret) { |
| fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret)); |
| return vk_error(device, VK_ERROR_DEVICE_LOST); |
| } |
| } |
| |
| for (uint32_t i = 0; i < pool->perfmon.ncounters; i++) |
| write_to_buffer(data, slot + i, do_64bit, counter_values[i]); |
| |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| write_query_result(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| uint32_t query, |
| bool do_64bit, |
| void *data, |
| uint32_t slot) |
| { |
| switch (pool->query_type) { |
| case VK_QUERY_TYPE_OCCLUSION: |
| return write_occlusion_query_result(device, pool, query, do_64bit, |
| data, slot); |
| case VK_QUERY_TYPE_TIMESTAMP: |
| return write_timestamp_query_result(device, pool, query, do_64bit, |
| data, slot); |
| case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: |
| return write_performance_query_result(device, pool, query, do_64bit, |
| data, slot); |
| default: |
| unreachable("Unsupported query type"); |
| } |
| } |
| |
| static uint32_t |
| get_query_result_count(struct v3dv_query_pool *pool) |
| { |
| switch (pool->query_type) { |
| case VK_QUERY_TYPE_OCCLUSION: |
| case VK_QUERY_TYPE_TIMESTAMP: |
| return 1; |
| case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: |
| return pool->perfmon.ncounters; |
| default: |
| unreachable("Unsupported query type"); |
| } |
| } |
| |
| VkResult |
| v3dv_get_query_pool_results_cpu(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| uint32_t first, |
| uint32_t count, |
| void *data, |
| VkDeviceSize stride, |
| VkQueryResultFlags flags) |
| { |
| assert(first < pool->query_count); |
| assert(first + count <= pool->query_count); |
| assert(data); |
| |
| const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT || |
| pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR; |
| const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT; |
| const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT; |
| |
| uint32_t result_count = get_query_result_count(pool); |
| |
| VkResult result = VK_SUCCESS; |
| for (uint32_t i = first; i < first + count; i++) { |
| bool available = false; |
| VkResult query_result = |
| query_is_available(device, pool, i, do_wait, &available); |
| if (query_result == VK_ERROR_DEVICE_LOST) |
| result = VK_ERROR_DEVICE_LOST; |
| |
| /** |
| * From the Vulkan 1.0 spec: |
| * |
| * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are |
| * both not set then no result values are written to pData for queries |
| * that are in the unavailable state at the time of the call, and |
| * vkGetQueryPoolResults returns VK_NOT_READY. However, availability |
| * state is still written to pData for those queries if |
| * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set." |
| */ |
| uint32_t slot = 0; |
| |
| const bool write_result = available || do_partial; |
| if (write_result) |
| write_query_result(device, pool, i, do_64bit, data, slot); |
| slot += result_count; |
| |
| if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) |
| write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u); |
| |
| if (!write_result && result != VK_ERROR_DEVICE_LOST) |
| result = VK_NOT_READY; |
| |
| data += stride; |
| } |
| |
| return result; |
| } |
| |
| VKAPI_ATTR VkResult VKAPI_CALL |
| v3dv_GetQueryPoolResults(VkDevice _device, |
| VkQueryPool queryPool, |
| uint32_t firstQuery, |
| uint32_t queryCount, |
| size_t dataSize, |
| void *pData, |
| VkDeviceSize stride, |
| VkQueryResultFlags flags) |
| { |
| V3DV_FROM_HANDLE(v3dv_device, device, _device); |
| V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); |
| |
| return v3dv_get_query_pool_results_cpu(device, pool, firstQuery, queryCount, |
| pData, stride, flags); |
| } |
| |
| /* Emits a series of vkCmdDispatchBase calls to execute all the workgroups |
| * required to handle a number of queries, splitting the work into batches |
| * that respect the maximum workgroup count per dispatch. |
| */ |
| static void |
| cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer *cmd_buffer, |
| uint32_t query_count) |
| { |
| VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); |
| |
| uint32_t dispatched = 0; |
| const uint32_t max_batch_size = 65535; |
| while (dispatched < query_count) { |
| uint32_t batch_size = MIN2(query_count - dispatched, max_batch_size); |
| v3dv_CmdDispatchBase(vk_cmd_buffer, dispatched, 0, 0, batch_size, 1, 1); |
| dispatched += batch_size; |
| } |
| } |
| |
| void |
| v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer, |
| struct v3dv_query_pool *pool, |
| uint32_t query, uint32_t count, |
| uint8_t availability) |
| { |
| assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION || |
| pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); |
| |
| struct v3dv_device *device = cmd_buffer->device; |
| VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); |
| |
| /* We are about to emit a compute job to set query availability and we need |
| * to ensure this executes after the graphics work using the queries has |
| * completed. |
| */ |
| VkMemoryBarrier2 barrier = { |
| .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, |
| .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, |
| .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, |
| }; |
| VkDependencyInfo barrier_info = { |
| .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, |
| .memoryBarrierCount = 1, |
| .pMemoryBarriers = &barrier, |
| }; |
| v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); |
| |
| /* Dispatch queries */ |
| v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); |
| |
| v3dv_CmdBindPipeline(vk_cmd_buffer, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| device->queries.avail_pipeline); |
| |
| v3dv_CmdBindDescriptorSets(vk_cmd_buffer, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| device->queries.avail_pipeline_layout, |
| 0, 1, &pool->meta.descriptor_set, |
| 0, NULL); |
| |
| struct { |
| uint32_t offset; |
| uint32_t query; |
| uint8_t availability; |
| } push_data = { pool->occlusion.avail_offset, query, availability }; |
| v3dv_CmdPushConstants(vk_cmd_buffer, |
| device->queries.avail_pipeline_layout, |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| 0, sizeof(push_data), &push_data); |
| cmd_buffer_emit_dispatch_queries(cmd_buffer, count); |
| |
| v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); |
| } |
| |
| static void |
| cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer *cmd_buffer, |
| struct v3dv_query_pool *pool, |
| uint32_t query, uint32_t count) |
| { |
| struct v3dv_device *device = cmd_buffer->device; |
| VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); |
| |
| /* Ensure the GPU is done with the queries in the graphics queue before |
| * we reset in the compute queue. |
| */ |
| VkMemoryBarrier2 barrier = { |
| .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, |
| .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, |
| .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, |
| }; |
| VkDependencyInfo barrier_info = { |
| .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, |
| .memoryBarrierCount = 1, |
| .pMemoryBarriers = &barrier, |
| }; |
| v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); |
| |
| /* Emit compute reset */ |
| v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); |
| |
| v3dv_CmdBindPipeline(vk_cmd_buffer, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| device->queries.reset_occlusion_pipeline); |
| |
| v3dv_CmdBindDescriptorSets(vk_cmd_buffer, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| device->queries.reset_occlusion_pipeline_layout, |
| 0, 1, &pool->meta.descriptor_set, |
| 0, NULL); |
| struct { |
| uint32_t offset; |
| uint32_t query; |
| } push_data = { pool->occlusion.avail_offset, query }; |
| v3dv_CmdPushConstants(vk_cmd_buffer, |
| device->queries.reset_occlusion_pipeline_layout, |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| 0, sizeof(push_data), &push_data); |
| |
| cmd_buffer_emit_dispatch_queries(cmd_buffer, count); |
| |
| v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); |
| |
| /* Ensure future work in the graphics queue using the queries doesn't start |
| * before the reset has completed. |
| */ |
| barrier = (VkMemoryBarrier2) { |
| .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, |
| .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, |
| .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT, |
| }; |
| barrier_info = (VkDependencyInfo) { |
| .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, |
| .memoryBarrierCount = 1, |
| .pMemoryBarriers = &barrier, |
| }; |
| v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); |
| } |
| |
| static void |
| cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer *cmd_buffer, |
| struct v3dv_query_pool *pool, |
| uint32_t first, uint32_t count) |
| { |
| assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION); |
| cmd_buffer_emit_reset_occlusion_query_pool(cmd_buffer, pool, first, count); |
| } |
| |
| static void |
| cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer *cmd_buffer, |
| struct v3dv_query_pool *pool, |
| uint32_t first, uint32_t count) |
| { |
| assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION); |
| |
| struct v3dv_job *job = |
| v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, |
| V3DV_JOB_TYPE_CPU_RESET_QUERIES, |
| cmd_buffer, -1); |
| v3dv_return_if_oom(cmd_buffer, NULL); |
| job->cpu.query_reset.pool = pool; |
| job->cpu.query_reset.first = first; |
| job->cpu.query_reset.count = count; |
| list_addtail(&job->list_link, &cmd_buffer->jobs); |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer, |
| VkQueryPool queryPool, |
| uint32_t firstQuery, |
| uint32_t queryCount) |
| { |
| V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); |
| V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); |
| |
| /* Resets can only happen outside a render pass instance so we should not |
| * be in the middle of job recording. |
| */ |
| assert(cmd_buffer->state.pass == NULL); |
| assert(cmd_buffer->state.job == NULL); |
| |
| assert(firstQuery < pool->query_count); |
| assert(firstQuery + queryCount <= pool->query_count); |
| |
| /* We can reset occlusion queries in the GPU, but for other query types |
| * we emit a CPU job that will call v3dv_reset_query_pool_cpu when executed |
| * in the queue. |
| */ |
| if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { |
| cmd_buffer_emit_reset_query_pool(cmd_buffer, pool, firstQuery, queryCount); |
| } else { |
| cmd_buffer_emit_reset_query_pool_cpu(cmd_buffer, pool, |
| firstQuery, queryCount); |
| } |
| } |
| |
| /** |
| * Creates a descriptor pool so we can create descriptors for the |
| * destination buffers of vkCmdCopyQueryPoolResults for queries where the |
| * copy is implemented in the GPU. |
| */ |
| static VkResult |
| create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer) |
| { |
| /* If this is not the first pool we create for this command buffer, size |
| * it based on the size of the currently exhausted pool. |
| */ |
| uint32_t descriptor_count = 32; |
| if (cmd_buffer->meta.query.dspool != VK_NULL_HANDLE) { |
| struct v3dv_descriptor_pool *exhausted_pool = |
| v3dv_descriptor_pool_from_handle(cmd_buffer->meta.query.dspool); |
| descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024); |
| } |
| |
| /* Create the descriptor pool */ |
| cmd_buffer->meta.query.dspool = VK_NULL_HANDLE; |
| VkDescriptorPoolSize pool_size = { |
| .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| .descriptorCount = descriptor_count, |
| }; |
| VkDescriptorPoolCreateInfo info = { |
| .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, |
| .maxSets = descriptor_count, |
| .poolSizeCount = 1, |
| .pPoolSizes = &pool_size, |
| .flags = 0, |
| }; |
| VkResult result = |
| v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device), |
| &info, |
| &cmd_buffer->device->vk.alloc, |
| &cmd_buffer->meta.query.dspool); |
| |
| if (result == VK_SUCCESS) { |
| assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE); |
| const VkDescriptorPool vk_pool = cmd_buffer->meta.query.dspool; |
| |
| v3dv_cmd_buffer_add_private_obj( |
| cmd_buffer, (uintptr_t) vk_pool, |
| (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool); |
| |
| struct v3dv_descriptor_pool *pool = |
| v3dv_descriptor_pool_from_handle(vk_pool); |
| pool->is_driver_internal = true; |
| } |
| |
| return result; |
| } |
| |
| static VkResult |
| allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer, |
| VkDescriptorSet *set) |
| { |
| /* Make sure we have a descriptor pool */ |
| VkResult result; |
| if (cmd_buffer->meta.query.dspool == VK_NULL_HANDLE) { |
| result = create_storage_buffer_descriptor_pool(cmd_buffer); |
| if (result != VK_SUCCESS) |
| return result; |
| } |
| assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE); |
| |
| /* Allocate descriptor set */ |
| struct v3dv_device *device = cmd_buffer->device; |
| VkDevice vk_device = v3dv_device_to_handle(device); |
| VkDescriptorSetAllocateInfo info = { |
| .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, |
| .descriptorPool = cmd_buffer->meta.query.dspool, |
| .descriptorSetCount = 1, |
| .pSetLayouts = &device->queries.buf_descriptor_set_layout, |
| }; |
| result = v3dv_AllocateDescriptorSets(vk_device, &info, set); |
| |
| /* If we ran out of pool space, grow the pool and try again */ |
| if (result == VK_ERROR_OUT_OF_POOL_MEMORY) { |
| result = create_storage_buffer_descriptor_pool(cmd_buffer); |
| if (result == VK_SUCCESS) { |
| info.descriptorPool = cmd_buffer->meta.query.dspool; |
| result = v3dv_AllocateDescriptorSets(vk_device, &info, set); |
| } |
| } |
| |
| return result; |
| } |
| |
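| /* Each combination of the result flags below (64-bit, with availability, |
| * partial) uses its own compute pipeline variant, so map the flags to an |
| * index into the copy_pipeline array. |
| */ |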
| static uint32_t |
| copy_pipeline_index_from_flags(VkQueryResultFlags flags) |
| { |
| uint32_t index = 0; |
| if (flags & VK_QUERY_RESULT_64_BIT) |
| index |= 1; |
| if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) |
| index |= 2; |
| if (flags & VK_QUERY_RESULT_PARTIAL_BIT) |
| index |= 4; |
| assert(index < 8); |
| return index; |
| } |
| |
| static nir_shader * |
| get_copy_query_results_cs(VkQueryResultFlags flags); |
| |
| static void |
| cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer *cmd_buffer, |
| struct v3dv_query_pool *pool, |
| uint32_t first, uint32_t count, |
| struct v3dv_buffer *buf, |
| uint32_t offset, uint32_t stride, |
| VkQueryResultFlags flags) |
| { |
| struct v3dv_device *device = cmd_buffer->device; |
| VkDevice vk_device = v3dv_device_to_handle(device); |
| VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); |
| |
| /* Create the required copy pipeline if not yet created */ |
| uint32_t pipeline_idx = copy_pipeline_index_from_flags(flags); |
| if (!device->queries.copy_pipeline[pipeline_idx]) { |
| nir_shader *copy_query_results_cs_nir = get_copy_query_results_cs(flags); |
| VkResult result = |
| v3dv_create_compute_pipeline_from_nir( |
| device, copy_query_results_cs_nir, |
| device->queries.copy_pipeline_layout, |
| &device->queries.copy_pipeline[pipeline_idx]); |
| ralloc_free(copy_query_results_cs_nir); |
| if (result != VK_SUCCESS) { |
| fprintf(stderr, "Failed to create copy query results pipeline\n"); |
| return; |
| } |
| } |
| |
| /* FIXME: do we need this barrier? vkCmdEndQuery should have been called |
| * before this and already waits, and this work is serialized with EndQuery |
| * in the compute queue anyway, so maybe we don't. |
| */ |
| if (flags & VK_QUERY_RESULT_WAIT_BIT) { |
| VkMemoryBarrier2 barrier = { |
| .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, |
| .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, |
| .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, |
| }; |
| VkDependencyInfo barrier_info = { |
| .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, |
| .memoryBarrierCount = 1, |
| .pMemoryBarriers = &barrier, |
| }; |
| v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); |
| } |
| |
| /* Allocate and setup descriptor set for output buffer */ |
| VkDescriptorSet out_buf_descriptor_set; |
| VkResult result = |
| allocate_storage_buffer_descriptor_set(cmd_buffer, |
| &out_buf_descriptor_set); |
| if (result != VK_SUCCESS) { |
| fprintf(stderr, "vkCmdCopyQueryPoolResults failed: " |
| "could not allocate descriptor.\n"); |
| return; |
| } |
| |
| VkDescriptorBufferInfo desc_buf_info = { |
| .buffer = v3dv_buffer_to_handle(buf), |
| .offset = 0, |
| .range = VK_WHOLE_SIZE, |
| }; |
| VkWriteDescriptorSet write = { |
| .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, |
| .dstSet = out_buf_descriptor_set, |
| .dstBinding = 0, |
| .dstArrayElement = 0, |
| .descriptorCount = 1, |
| .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| .pBufferInfo = &desc_buf_info, |
| }; |
| v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); |
| |
| /* Dispatch copy */ |
| v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); |
| |
| assert(device->queries.copy_pipeline[pipeline_idx]); |
| v3dv_CmdBindPipeline(vk_cmd_buffer, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| device->queries.copy_pipeline[pipeline_idx]); |
| |
| VkDescriptorSet sets[2] = { |
| pool->meta.descriptor_set, |
| out_buf_descriptor_set, |
| }; |
| v3dv_CmdBindDescriptorSets(vk_cmd_buffer, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| device->queries.copy_pipeline_layout, |
| 0, 2, sets, 0, NULL); |
| |
| struct { |
| uint32_t avail_offset, first, offset, stride, flags; |
| } push_data = { pool->occlusion.avail_offset, first, offset, stride, flags }; |
| v3dv_CmdPushConstants(vk_cmd_buffer, |
| device->queries.copy_pipeline_layout, |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| 0, sizeof(push_data), &push_data); |
| |
| cmd_buffer_emit_dispatch_queries(cmd_buffer, count); |
| |
| v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); |
| } |
| |
| static void |
| cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer *cmd_buffer, |
| struct v3dv_query_pool *pool, |
| uint32_t first, |
| uint32_t count, |
| struct v3dv_buffer *dst, |
| uint32_t offset, |
| uint32_t stride, |
| VkQueryResultFlags flags) |
| { |
| struct v3dv_job *job = |
| v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, |
| V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, |
| cmd_buffer, -1); |
| v3dv_return_if_oom(cmd_buffer, NULL); |
| |
| job->cpu.query_copy_results.pool = pool; |
| job->cpu.query_copy_results.first = first; |
| job->cpu.query_copy_results.count = count; |
| job->cpu.query_copy_results.dst = dst; |
| job->cpu.query_copy_results.offset = offset; |
| job->cpu.query_copy_results.stride = stride; |
| job->cpu.query_copy_results.flags = flags; |
| |
| list_addtail(&job->list_link, &cmd_buffer->jobs); |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, |
| VkQueryPool queryPool, |
| uint32_t firstQuery, |
| uint32_t queryCount, |
| VkBuffer dstBuffer, |
| VkDeviceSize dstOffset, |
| VkDeviceSize stride, |
| VkQueryResultFlags flags) |
| { |
| V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); |
| V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); |
| V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer); |
| |
| /* Copies can only happen outside a render pass instance so we should not |
| * be in the middle of job recording. |
| */ |
| assert(cmd_buffer->state.pass == NULL); |
| assert(cmd_buffer->state.job == NULL); |
| |
| assert(firstQuery < pool->query_count); |
| assert(firstQuery + queryCount <= pool->query_count); |
| |
| /* For occlusion queries we implement the copy in the GPU but for other |
| * queries we emit a CPU job that will call v3dv_get_query_pool_results_cpu |
| * when executed in the queue. |
| */ |
| if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { |
| cmd_buffer_emit_copy_query_pool_results(cmd_buffer, pool, |
| firstQuery, queryCount, |
| dst, (uint32_t) dstOffset, |
| (uint32_t) stride, flags); |
| } else { |
| cmd_buffer_emit_copy_query_pool_results_cpu(cmd_buffer, pool, |
| firstQuery, queryCount, |
| dst, (uint32_t)dstOffset, |
| (uint32_t) stride, flags); |
| } |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer, |
| VkQueryPool queryPool, |
| uint32_t query, |
| VkQueryControlFlags flags) |
| { |
| V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); |
| V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); |
| |
| v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags); |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| v3dv_CmdEndQuery(VkCommandBuffer commandBuffer, |
| VkQueryPool queryPool, |
| uint32_t query) |
| { |
| V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); |
| V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); |
| |
| v3dv_cmd_buffer_end_query(cmd_buffer, pool, query); |
| } |
| |
| void |
| v3dv_reset_query_pool_cpu(struct v3dv_device *device, |
| struct v3dv_query_pool *pool, |
| uint32_t first, |
| uint32_t count) |
| { |
| mtx_lock(&device->query_mutex); |
| |
| for (uint32_t i = first; i < first + count; i++) { |
| assert(i < pool->query_count); |
| struct v3dv_query *q = &pool->queries[i]; |
| q->maybe_available = false; |
| switch (pool->query_type) { |
| case VK_QUERY_TYPE_OCCLUSION: { |
| /* Reset availability */ |
| uint8_t *base_addr = ((uint8_t *) pool->occlusion.bo->map) + |
| pool->occlusion.avail_offset + first; |
| memset(base_addr, 0, count); |
| |
| /* Reset occlusion counter */ |
| const uint8_t *q_addr = |
| ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset; |
| uint32_t *counter = (uint32_t *) q_addr; |
| *counter = 0; |
| break; |
| } |
| case VK_QUERY_TYPE_TIMESTAMP: |
| q->value = 0; |
| break; |
| case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: |
| kperfmon_destroy(device, pool, i); |
| kperfmon_create(device, pool, i); |
| if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS) |
| fprintf(stderr, "Failed to reset sync"); |
| break; |
| default: |
| unreachable("Unsupported query type"); |
| } |
| } |
| |
| mtx_unlock(&device->query_mutex); |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| v3dv_ResetQueryPool(VkDevice _device, |
| VkQueryPool queryPool, |
| uint32_t firstQuery, |
| uint32_t queryCount) |
| { |
| V3DV_FROM_HANDLE(v3dv_device, device, _device); |
| V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); |
| |
| v3dv_reset_query_pool_cpu(device, pool, firstQuery, queryCount); |
| } |
| |
| VKAPI_ATTR VkResult VKAPI_CALL |
| v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( |
| VkPhysicalDevice physicalDevice, |
| uint32_t queueFamilyIndex, |
| uint32_t *pCounterCount, |
| VkPerformanceCounterKHR *pCounters, |
| VkPerformanceCounterDescriptionKHR *pCounterDescriptions) |
| { |
| uint32_t desc_count = *pCounterCount; |
| |
| VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, |
| out, pCounters, pCounterCount); |
| VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, |
| out_desc, pCounterDescriptions, &desc_count); |
| |
| for (int i = 0; i < ARRAY_SIZE(v3dv_counters); i++) { |
| vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { |
| counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; |
| counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; |
| counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; |
| |
| unsigned char sha1_result[20]; |
| _mesa_sha1_compute(v3dv_counters[i][1], strlen(v3dv_counters[i][1]), |
| sha1_result); |
| |
| memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); |
| } |
| |
| vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, |
| &out_desc, desc) { |
| desc->flags = 0; |
| snprintf(desc->name, sizeof(desc->name), "%s", |
| v3dv_counters[i][1]); |
| snprintf(desc->category, sizeof(desc->category), "%s", |
| v3dv_counters[i][0]); |
| snprintf(desc->description, sizeof(desc->description), "%s", |
| v3dv_counters[i][2]); |
| } |
| } |
| |
| return vk_outarray_status(&out); |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( |
| VkPhysicalDevice physicalDevice, |
| const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, |
| uint32_t *pNumPasses) |
| { |
| *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount, |
| DRM_V3D_MAX_PERF_COUNTERS); |
| } |
| |
| VKAPI_ATTR VkResult VKAPI_CALL |
| v3dv_AcquireProfilingLockKHR( |
| VkDevice _device, |
| const VkAcquireProfilingLockInfoKHR *pInfo) |
| { |
| return VK_SUCCESS; |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| v3dv_ReleaseProfilingLockKHR(VkDevice device) |
| { |
| } |
| |
| static inline void |
| nir_set_query_availability(nir_builder *b, |
| nir_ssa_def *buf, |
| nir_ssa_def *offset, |
| nir_ssa_def *query_idx, |
| nir_ssa_def *avail) |
| { |
| offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ |
| nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1); |
| } |
| |
| static inline nir_ssa_def * |
| nir_get_query_availability(nir_builder *b, |
| nir_ssa_def *buf, |
| nir_ssa_def *offset, |
| nir_ssa_def *query_idx) |
| { |
| offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ |
| nir_ssa_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1); |
| return nir_i2i32(b, avail); |
| } |
| |
| static nir_shader * |
| get_set_query_availability_cs(void) |
| { |
| const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); |
| nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, |
| "set query availability cs"); |
| |
| /* We rely on supergroup packing to maximize SIMD lane occupancy */ |
| b.shader->info.workgroup_size[0] = 1; |
| b.shader->info.workgroup_size[1] = 1; |
| b.shader->info.workgroup_size[2] = 1; |
| |
| nir_ssa_def *buf = |
| nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), |
| .desc_set = 0, |
| .binding = 0, |
| .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); |
| |
| /* This assumes a local size of 1 and a horizontal-only dispatch. If we |
| * ever change any of these parameters we need to update how we compute the |
| * query index here. |
| */ |
| nir_ssa_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b, 32), 0); |
| |
| nir_ssa_def *offset = |
| nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); |
| |
| nir_ssa_def *query_idx = |
| nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); |
| |
| nir_ssa_def *avail = |
| nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1); |
| |
| query_idx = nir_iadd(&b, query_idx, wg_id); |
| nir_set_query_availability(&b, buf, offset, query_idx, avail); |
| |
| return b.shader; |
| } |
| |
| static inline nir_ssa_def * |
| nir_get_occlusion_counter_offset(nir_builder *b, nir_ssa_def *query_idx) |
| { |
| nir_ssa_def *query_group = nir_udiv(b, query_idx, nir_imm_int(b, 16)); |
| nir_ssa_def *query_group_offset = nir_umod(b, query_idx, nir_imm_int(b, 16)); |
| nir_ssa_def *offset = |
| nir_iadd(b, nir_imul(b, query_group, nir_imm_int(b, 1024)), |
| nir_imul(b, query_group_offset, nir_imm_int(b, 4))); |
| return offset; |
| } |
| |
| static inline void |
| nir_reset_occlusion_counter(nir_builder *b, |
| nir_ssa_def *buf, |
| nir_ssa_def *query_idx) |
| { |
| nir_ssa_def *offset = nir_get_occlusion_counter_offset(b, query_idx); |
| nir_ssa_def *zero = nir_imm_int(b, 0); |
| nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4); |
| } |
| |
| static inline nir_ssa_def * |
| nir_read_occlusion_counter(nir_builder *b, |
| nir_ssa_def *buf, |
| nir_ssa_def *query_idx) |
| { |
| nir_ssa_def *offset = nir_get_occlusion_counter_offset(b, query_idx); |
| return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4); |
| } |
| |
| static nir_shader * |
| get_reset_occlusion_query_cs(void) |
| { |
| const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); |
| nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, |
| "reset occlusion query cs"); |
| |
| /* We rely on supergroup packing to maximize SIMD lane occupancy */ |
| b.shader->info.workgroup_size[0] = 1; |
| b.shader->info.workgroup_size[1] = 1; |
| b.shader->info.workgroup_size[2] = 1; |
| |
| nir_ssa_def *buf = |
| nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), |
| .desc_set = 0, |
| .binding = 0, |
| .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); |
| |
| /* This assumes a local size of 1 and a horizontal-only dispatch. If we |
| * ever change any of these parameters we need to update how we compute the |
| * query index here. |
| */ |
| nir_ssa_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b, 32), 0); |
| |
| nir_ssa_def *avail_offset = |
| nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); |
| |
| nir_ssa_def *base_query_idx = |
| nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); |
| |
| nir_ssa_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); |
| |
| nir_set_query_availability(&b, buf, avail_offset, query_idx, |
| nir_imm_intN_t(&b, 0, 8)); |
| nir_reset_occlusion_counter(&b, buf, query_idx); |
| |
| return b.shader; |
| } |
| |
| static void |
| write_query_buffer(nir_builder *b, |
| nir_ssa_def *buf, |
| nir_ssa_def **offset, |
| nir_ssa_def *value, |
| bool flag_64bit) |
| { |
| if (flag_64bit) { |
| /* Create a 64-bit value using a vec2 with the .Y component set to 0 |
| * so we can write a 64-bit value in a single store. |
| */ |
| nir_ssa_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0)); |
| nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8); |
| *offset = nir_iadd(b, *offset, nir_imm_int(b, 8)); |
| } else { |
| nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4); |
| *offset = nir_iadd(b, *offset, nir_imm_int(b, 4)); |
| } |
| } |
| |
| static nir_shader * |
| get_copy_query_results_cs(VkQueryResultFlags flags) |
| { |
| bool flag_64bit = flags & VK_QUERY_RESULT_64_BIT; |
| bool flag_avail = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT; |
| bool flag_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT; |
| |
| const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); |
| nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, |
| "copy query results cs"); |
| |
| /* We rely on supergroup packing to maximize SIMD lane occupancy */ |
| b.shader->info.workgroup_size[0] = 1; |
| b.shader->info.workgroup_size[1] = 1; |
| b.shader->info.workgroup_size[2] = 1; |
| |
| nir_ssa_def *buf = |
| nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), |
| .desc_set = 0, |
| .binding = 0, |
| .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); |
| |
| nir_ssa_def *buf_out = |
| nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), |
| .desc_set = 1, |
| .binding = 0, |
| .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); |
| |
| /* Read push constants */ |
| nir_ssa_def *avail_offset = |
| nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); |
| |
| nir_ssa_def *base_query_idx = |
| nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); |
| |
| nir_ssa_def *base_offset_out = |
| nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4); |
| |
| nir_ssa_def *stride = |
| nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4); |
| |
| /* This assumes a local size of 1 and a horizontal-only dispatch. If we |
| * ever change any of these parameters we need to update how we compute the |
| * query index here. |
| */ |
| nir_ssa_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b, 32), 0); |
| nir_ssa_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); |
| |
   /* Read the query availability flag if the caller requested it explicitly
    * or if we need it to gate the copy (i.e. PARTIAL was not requested).
    */
| nir_ssa_def *avail = NULL; |
| if (flag_avail || !flag_partial) |
| avail = nir_get_query_availability(&b, buf, avail_offset, query_idx); |
| |
   /* Write occlusion query result... */
| nir_ssa_def *offset = |
| nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride)); |
| |
| /* ...if partial is requested, we always write */ |
   if (flag_partial) {
| nir_ssa_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); |
| write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit); |
| } else { |
      /* ...otherwise, we only write if the query is available */
| nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0)); |
| nir_ssa_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); |
| write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit); |
| nir_pop_if(&b, if_stmt); |
| } |
| |
| /* Write query availability */ |
| if (flag_avail) |
| write_query_buffer(&b, buf_out, &offset, avail, flag_64bit); |
| |
| return b.shader; |
| } |
| |
| static bool |
| create_query_pipelines(struct v3dv_device *device) |
| { |
| VkResult result; |
| VkPipeline pipeline; |
| |
| /* Set layout: single storage buffer */ |
| if (!device->queries.buf_descriptor_set_layout) { |
| VkDescriptorSetLayoutBinding descriptor_set_layout_binding = { |
| .binding = 0, |
| .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| .descriptorCount = 1, |
| .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| }; |
| VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = { |
| .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, |
| .bindingCount = 1, |
| .pBindings = &descriptor_set_layout_binding, |
| }; |
| result = |
| v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device), |
| &descriptor_set_layout_info, |
| &device->vk.alloc, |
| &device->queries.buf_descriptor_set_layout); |
| if (result != VK_SUCCESS) |
| return false; |
| } |
| |
| /* Set availability pipeline. |
| * |
| * Pipeline layout: |
| * - 1 storage buffer for the BO with the query availability. |
    * - 3 push constants (9 bytes in total):
    *   0B: offset of the availability info in the buffer (4 bytes)
    *   4B: base query index (4 bytes)
    *   8B: availability (1 byte)
| */ |
| if (!device->queries.avail_pipeline_layout) { |
| VkPipelineLayoutCreateInfo pipeline_layout_info = { |
| .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, |
| .setLayoutCount = 1, |
| .pSetLayouts = &device->queries.buf_descriptor_set_layout, |
| .pushConstantRangeCount = 1, |
| .pPushConstantRanges = |
| &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 9 }, |
| }; |
| |
| result = |
| v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), |
| &pipeline_layout_info, |
| &device->vk.alloc, |
| &device->queries.avail_pipeline_layout); |
| |
| if (result != VK_SUCCESS) |
| return false; |
| } |
| |
| if (!device->queries.avail_pipeline) { |
| nir_shader *set_query_availability_cs_nir = get_set_query_availability_cs(); |
| result = v3dv_create_compute_pipeline_from_nir(device, |
| set_query_availability_cs_nir, |
| device->queries.avail_pipeline_layout, |
| &pipeline); |
| ralloc_free(set_query_availability_cs_nir); |
| if (result != VK_SUCCESS) |
| return false; |
| |
| device->queries.avail_pipeline = pipeline; |
| } |
| |
| /* Reset occlusion query pipeline. |
| * |
| * Pipeline layout: |
| * - 1 storage buffer for the BO with the occlusion and availability data. |
| * - Push constants: |
| * 0B: offset of the availability info in the buffer (4B) |
| * 4B: base query index (4B) |
| */ |
| if (!device->queries.reset_occlusion_pipeline_layout) { |
| VkPipelineLayoutCreateInfo pipeline_layout_info = { |
| .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, |
| .setLayoutCount = 1, |
| .pSetLayouts = &device->queries.buf_descriptor_set_layout, |
| .pushConstantRangeCount = 1, |
| .pPushConstantRanges = |
| &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 8 }, |
| }; |
| |
| result = |
| v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), |
| &pipeline_layout_info, |
| &device->vk.alloc, |
| &device->queries.reset_occlusion_pipeline_layout); |
| |
| if (result != VK_SUCCESS) |
| return false; |
| } |
| |
| if (!device->queries.reset_occlusion_pipeline) { |
| nir_shader *reset_occlusion_query_cs_nir = get_reset_occlusion_query_cs(); |
| result = v3dv_create_compute_pipeline_from_nir( |
| device, |
| reset_occlusion_query_cs_nir, |
| device->queries.reset_occlusion_pipeline_layout, |
| &pipeline); |
| ralloc_free(reset_occlusion_query_cs_nir); |
| if (result != VK_SUCCESS) |
| return false; |
| |
| device->queries.reset_occlusion_pipeline = pipeline; |
| } |
| |
| /* Copy query results pipelines. |
| * |
| * Pipeline layout: |
| * - 1 storage buffer for the BO with the query availability and occlusion. |
| * - 1 storage buffer for the output. |
| * - Push constants: |
| * 0B: offset of the availability info in the buffer (4B) |
| * 4B: base query index (4B) |
| * 8B: offset into output buffer (4B) |
| * 12B: stride (4B) |
| * |
| * We create multiple specialized pipelines depending on the copy flags |
| * to remove conditionals from the copy shader and get more optimized |
| * pipelines. |
| */ |
| if (!device->queries.copy_pipeline_layout) { |
| VkDescriptorSetLayout set_layouts[2] = { |
| device->queries.buf_descriptor_set_layout, |
| device->queries.buf_descriptor_set_layout |
| }; |
| VkPipelineLayoutCreateInfo pipeline_layout_info = { |
| .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, |
| .setLayoutCount = 2, |
| .pSetLayouts = set_layouts, |
| .pushConstantRangeCount = 1, |
| .pPushConstantRanges = |
| &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 16 }, |
| }; |
| |
| result = |
| v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), |
| &pipeline_layout_info, |
| &device->vk.alloc, |
| &device->queries.copy_pipeline_layout); |
| |
| if (result != VK_SUCCESS) |
| return false; |
| } |
| |
   /* The actual copy pipelines are created lazily, on demand: there can be up
    * to 8 of them depending on the flags used, but applications are likely to
    * use the same flags every time, in which case only one is ever created.
    */
| |
| return true; |
| } |
| |
| static void |
| destroy_query_pipelines(struct v3dv_device *device) |
| { |
| VkDevice _device = v3dv_device_to_handle(device); |
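
   /* The vkDestroy*() entry points are required by the spec to accept
    * VK_NULL_HANDLE, so it is safe to destroy these objects unconditionally
    * even if some of them were never created.
    */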
| |
| /* Availability pipeline */ |
| v3dv_DestroyPipeline(_device, device->queries.avail_pipeline, |
| &device->vk.alloc); |
| device->queries.avail_pipeline = VK_NULL_HANDLE; |
| v3dv_DestroyPipelineLayout(_device, device->queries.avail_pipeline_layout, |
| &device->vk.alloc); |
| device->queries.avail_pipeline_layout = VK_NULL_HANDLE; |
| |
| /* Reset occlusion pipeline */ |
| v3dv_DestroyPipeline(_device, device->queries.reset_occlusion_pipeline, |
| &device->vk.alloc); |
| device->queries.reset_occlusion_pipeline = VK_NULL_HANDLE; |
| v3dv_DestroyPipelineLayout(_device, |
| device->queries.reset_occlusion_pipeline_layout, |
| &device->vk.alloc); |
| device->queries.reset_occlusion_pipeline_layout = VK_NULL_HANDLE; |
| |
| /* Copy pipelines */ |
| for (int i = 0; i < 8; i++) { |
| v3dv_DestroyPipeline(_device, device->queries.copy_pipeline[i], |
| &device->vk.alloc); |
| device->queries.copy_pipeline[i] = VK_NULL_HANDLE; |
| } |
| v3dv_DestroyPipelineLayout(_device, device->queries.copy_pipeline_layout, |
| &device->vk.alloc); |
| device->queries.copy_pipeline_layout = VK_NULL_HANDLE; |
| |
| v3dv_DestroyDescriptorSetLayout(_device, |
| device->queries.buf_descriptor_set_layout, |
| &device->vk.alloc); |
| device->queries.buf_descriptor_set_layout = VK_NULL_HANDLE; |
| } |
| |
| /** |
| * Allocates device resources for implementing certain types of queries. |
| */ |
| VkResult |
| v3dv_query_allocate_resources(struct v3dv_device *device) |
| { |
| if (!create_query_pipelines(device)) |
| return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| return VK_SUCCESS; |
| } |
| |
| void |
| v3dv_query_free_resources(struct v3dv_device *device) |
| { |
| destroy_query_pipelines(device); |
| } |