/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "ac_perfcounter.h"
#include "amdgfxregs.h"
#include "radv_cs.h"
#include "radv_private.h"
#include "sid.h"

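/* Select which shader stages are observed by the SQ performance counters: the low bits of
 * SQ_PERFCOUNTER_CTRL form the per-stage enable mask, and the second register of the sequence is
 * written with ~0 so counting is not otherwise restricted.
 */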
void
radv_perfcounter_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(cs, shaders & 0x7f);
   radeon_emit(cs, 0xffffffff);
}

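/* Enable or disable the windowed counters: on the general queue this is done with the
 * PERFCOUNTER_START/STOP events, and compute work is additionally gated through
 * COMPUTE_PERFCOUNT_ENABLE.
 */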
static void
radv_emit_windowed_counters(struct radv_device *device, struct radeon_cmdbuf *cs, int family,
                            bool enable)
{
   if (family == RADV_QUEUE_GENERAL) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(enable ? V_028A90_PERFCOUNTER_START : V_028A90_PERFCOUNTER_STOP) |
                         EVENT_INDEX(0));
   }

   radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(enable));
}

void
radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
}

void
radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   /* Start SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(device, cs, family, true);
}

void
radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   radv_emit_windowed_counters(device, cs, family, false);

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(device->physical_device->rad_info.never_stop_sq_perf_counters ?
                                                           V_036020_STRM_PERFMON_STATE_START_COUNTING :
                                                           V_036020_STRM_PERFMON_STATE_STOP_COUNTING));
}

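/* How the sampled registers are combined into one counter value (see radv_pc_get_result()):
 *  - SUM:            regs[0], summed over all instances
 *  - MAX:            regs[0], maximum over all instances
 *  - RATIO_DIVSCALE: regs[0] / (regs[1] * regs[2]) * 100
 *  - REVERSE_RATIO:  (regs[1] - regs[0]) / regs[1] * 100
 *  - SUM_WEIGHTED_4: sum over i in [0, 4) of regs[2i] * regs[2i+1]
 */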
enum radv_perfcounter_op {
   RADV_PC_OP_SUM,
   RADV_PC_OP_MAX,
   RADV_PC_OP_RATIO_DIVSCALE,
   RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */
   RADV_PC_OP_SUM_WEIGHTED_4,
};

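/* A counter register is packed into a uint32_t in one of two ways.
 *
 * In the descriptions below it is a selection: bits [15:0] hold the per-block selector, bits
 * [30:16] the block, and bit 31 marks the entry as a literal constant rather than a hardware
 * counter.
 *
 * After radv_pc_init_query_pool() the non-constant entries of each counter are rewritten into a
 * location: bits [15:0] then hold the byte offset of the data in the query buffer and bits
 * [30:16] the number of instances to accumulate.
 */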
#define S_REG_SEL(x) ((x)&0xFFFF)
#define G_REG_SEL(x) ((x)&0xFFFF)
#define S_REG_BLOCK(x) ((x) << 16)
#define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF)

#define S_REG_OFFSET(x) ((x)&0xFFFF)
#define G_REG_OFFSET(x) ((x)&0xFFFF)
#define S_REG_INSTANCES(x) ((x) << 16)
#define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF)
#define S_REG_CONSTANT(x) ((x) << 31)
#define G_REG_CONSTANT(x) ((x) >> 31)

struct radv_perfcounter_impl {
   enum radv_perfcounter_op op;
   uint32_t regs[8];
};

/* Only append to this list, never insert into the middle or remove entries (renaming is fine).
 *
 * The invariant is that a UUID always identifies a counter with the same meaning: the same UUID
 * may be shared by counters that are implemented differently on different GPUs, but it must be
 * unique within a single GPU.
 */
enum radv_perfcounter_uuid {
   RADV_PC_UUID_GPU_CYCLES,
   RADV_PC_UUID_SHADER_WAVES,
   RADV_PC_UUID_SHADER_INSTRUCTIONS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS,
   RADV_PC_UUID_SHADER_VALU_BUSY,
   RADV_PC_UUID_SHADER_SALU_BUSY,
   RADV_PC_UUID_VRAM_READ_SIZE,
   RADV_PC_UUID_VRAM_WRITE_SIZE,
   RADV_PC_UUID_L0_CACHE_HIT_RATIO,
   RADV_PC_UUID_L1_CACHE_HIT_RATIO,
   RADV_PC_UUID_L2_CACHE_HIT_RATIO,
};

struct radv_perfcounter_desc {
   struct radv_perfcounter_impl impl;

   VkPerformanceCounterUnitKHR unit;

   char name[VK_MAX_DESCRIPTION_SIZE];
   char category[VK_MAX_DESCRIPTION_SIZE];
   char description[VK_MAX_DESCRIPTION_SIZE];
   enum radv_perfcounter_uuid uuid;
};

#define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...) \
   (struct radv_perfcounter_desc) \
   { \
      .impl = {.op = arg_op, .regs = {__VA_ARGS__}}, \
      .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR, .name = arg_name, \
      .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid \
   }

#define ADD_PC(op, unit, name, category, description, uuid, ...) \
   do { \
      if (descs) { \
         descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__); \
      } \
      ++*count; \
   } while (0)
#define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr))
#define CONSTANT(v) (S_REG_CONSTANT(1) | (uint32_t)(v))

enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) };

enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) };

enum {
   GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe),
   GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12),
};

enum {
   GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3),

   GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23),
   GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c),

   GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b),
   GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66),
};

enum {
   SQ_PERF_SEL_WAVES = CTR(SQ, 0x4),
   SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31),
   SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37),
   SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b),
   SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c),
   SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d),
   SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40),
   SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45),
   SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46),
   SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75),
};

enum {
   TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9),
   TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12),
};

#define CTR_NUM_SIMD \
   CONSTANT(pdev->rad_info.num_simd_per_compute_unit * pdev->rad_info.num_cu)
#define CTR_NUM_CUS CONSTANT(pdev->rad_info.num_cu)

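/* Build the list of exposed counters. Following the usual Vulkan two-call idiom this is invoked
 * once with descs == NULL to count the entries and once to fill them; ADD_PC() handles both
 * cases.
 */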
static void
radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count,
                             struct radv_perfcounter_desc *descs)
{
   *count = 0;

   ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM",
          "cycles the GPU is active processing a command buffer.", GPU_CYCLES,
          GRBM_PERF_SEL_GUI_ACTIVE);

   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES,
          SQ_PERF_SEL_WAVES);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of Instructions executed",
          SHADER_INSTRUCTIONS, SQ_PERF_SEL_INSTS_ALL_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders",
          "Number of VALU Instructions executed", SHADER_INSTRUCTIONS_VALU,
          SQ_PERF_SEL_INSTS_VALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders",
          "Number of SALU Instructions executed", SHADER_INSTRUCTIONS_SALU,
          SQ_PERF_SEL_INSTS_SALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders",
          "Number of VMEM load instructions executed", SHADER_INSTRUCTIONS_VMEM_LOAD,
          SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders",
          "Number of SMEM load instructions executed", SHADER_INSTRUCTIONS_SMEM_LOAD,
          SQ_PERF_SEL_INSTS_SMEM_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders",
          "Number of VMEM store instructions executed", SHADER_INSTRUCTIONS_VMEM_STORE,
          SQ_PERF_SEL_INSTS_TEX_STORE_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders",
          "Number of LDS Instructions executed", SHADER_INSTRUCTIONS_LDS,
          SQ_PERF_SEL_INSTS_LDS_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders",
          "Number of GDS Instructions executed", SHADER_INSTRUCTIONS_GDS,
          SQ_PERF_SEL_INSTS_GDS_GFX10);

   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization",
          "Percentage of time the VALU units are busy", SHADER_VALU_BUSY,
          SQ_PERF_SEL_INST_CYCLES_VALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD);
   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization",
          "Percentage of time the SALU units are busy", SHADER_SALU_BUSY,
          SQ_PERF_SEL_INSTS_SALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS);

   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103,
             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103, CONSTANT(64),
             GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103,
             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103, CONSTANT(32), CONSTANT(0),
             CONSTANT(0), CONSTANT(0), CONSTANT(0));
   } else {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101,
             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101, CONSTANT(64),
             GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101,
             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101, CONSTANT(32), CONSTANT(0),
             CONSTANT(0), CONSTANT(0), CONSTANT(0));
   }

   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache",
          L0_CACHE_HIT_RATIO, TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10);
   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache",
          L1_CACHE_HIT_RATIO, GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ);
   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103,
             GL2C_PERF_SEL_REQ);
   } else {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101,
             GL2C_PERF_SEL_REQ);
   }
}

static bool
radv_init_perfcounter_descs(struct radv_physical_device *pdev)
{
   if (pdev->perfcounters)
      return true;

   uint32_t count;
   radv_query_perfcounter_descs(pdev, &count, NULL);

   struct radv_perfcounter_desc *descs = malloc(sizeof(*descs) * count);
   if (!descs)
      return false;

   radv_query_perfcounter_descs(pdev, &count, descs);
   pdev->num_perfcounters = count;
   pdev->perfcounters = descs;

   return true;
}

static int
cmp_uint32_t(const void *a, const void *b)
{
   uint32_t l = *(const uint32_t *)a;
   uint32_t r = *(const uint32_t *)b;

   return (l < r) ? -1 : (l > r) ? 1 : 0;
}

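/* Gather the hardware registers required by the selected counters: constants are skipped and the
 * remaining selections are sorted (which groups them by block) and deduplicated so every
 * register is programmed and sampled only once.
 */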
static VkResult
radv_get_counter_registers(const struct radv_physical_device *pdevice, uint32_t num_indices,
                           const uint32_t *indices, unsigned *out_num_regs, uint32_t **out_regs)
{
   ASSERTED uint32_t num_counters = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   unsigned full_reg_cnt = num_indices * ARRAY_SIZE(descs->impl.regs);
   uint32_t *regs = malloc(full_reg_cnt * sizeof(uint32_t));
   if (!regs)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned reg_cnt = 0;
   for (unsigned i = 0; i < num_indices; ++i) {
      uint32_t index = indices[i];
      assert(index < num_counters);
      for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j];
           ++j) {
         if (!G_REG_CONSTANT(descs[index].impl.regs[j]))
            regs[reg_cnt++] = descs[index].impl.regs[j];
      }
   }

   qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t);

   unsigned deduped_reg_cnt = 0;
   for (unsigned i = 1; i < reg_cnt; ++i) {
      if (regs[i] != regs[deduped_reg_cnt])
         regs[++deduped_reg_cnt] = regs[i];
   }
   ++deduped_reg_cnt;

   *out_num_regs = deduped_reg_cnt;
   *out_regs = regs;
   return VK_SUCCESS;
}

static unsigned
radv_pc_get_num_instances(const struct radv_physical_device *pdevice, struct ac_pc_block *ac_block)
{
   return ac_block->num_instances *
          ((ac_block->b->b->flags & AC_PC_BLOCK_SE) ? pdevice->rad_info.max_se : 1);
}

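/* A block only has a limited number of hardware counter slots, so if more registers of one block
 * are requested than fit, the query has to be replayed over multiple passes. The pass count is
 * the worst case over all blocks, e.g. 20 SQ selectors on a block with 16 counter slots need
 * DIV_ROUND_UP(20, 16) = 2 passes. This relies on the register list being sorted by block.
 */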
static unsigned
radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned num_regs,
                            const uint32_t *regs)
{
   enum ac_pc_gpu_block prev_block = NUM_GPU_BLOCK;
   unsigned block_reg_count = 0;
   struct ac_pc_block *ac_block = NULL;
   unsigned passes_needed = 1;

   for (unsigned i = 0; i < num_regs; ++i) {
      enum ac_pc_gpu_block block = G_REG_BLOCK(regs[i]);

      if (block != prev_block) {
         block_reg_count = 0;
         prev_block = block;
         ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      }

      ++block_reg_count;

      passes_needed =
         MAX2(passes_needed, DIV_ROUND_UP(block_reg_count, ac_block->b->b->num_counters));
   }

   return passes_needed;
}

void
radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool)
{
   free(pool->counters);
   free(pool->pc_regs);
}

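/* Layout of one query in the pool's buffer (pool->b.stride bytes):
 *
 *   for each hardware register in pool->pc_regs (sorted by block)
 *      for each instance of its block
 *         uint64_t begin_value
 *         uint64_t end_value
 *   8 bytes of availability per pass (written once the end sample of that pass has executed)
 *
 * The counter implementations are copied and their non-constant regs[] entries rewritten into
 * {offset, instance count} pairs pointing into this layout, so results can be resolved without
 * the counter descriptions.
 */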
VkResult
radv_pc_init_query_pool(struct radv_physical_device *pdevice,
                        const VkQueryPoolCreateInfo *pCreateInfo, struct radv_pc_query_pool *pool)
{
   const VkQueryPoolPerformanceCreateInfoKHR *perf_info =
      vk_find_struct_const(pCreateInfo->pNext, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
   VkResult result;

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   result =
      radv_get_counter_registers(pdevice, perf_info->counterIndexCount, perf_info->pCounterIndices,
                                 &pool->num_pc_regs, &pool->pc_regs);
   if (result != VK_SUCCESS)
      return result;

   pool->num_passes = radv_get_num_counter_passes(pdevice, pool->num_pc_regs, pool->pc_regs);

   uint32_t *pc_reg_offsets = malloc(pool->num_pc_regs * sizeof(uint32_t));
   if (!pc_reg_offsets)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned offset = 0;
   for (unsigned i = 0; i < pool->num_pc_regs; ++i) {
      enum ac_pc_gpu_block block = pool->pc_regs[i] >> 16;
      struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

      pc_reg_offsets[i] = S_REG_OFFSET(offset) | S_REG_INSTANCES(num_instances);
      offset += sizeof(uint64_t) * 2 * num_instances;
   }

   /* Reserve 8 bytes per pass at the end of the stride so each pass can signal completion. */
   pool->b.stride = offset + 8 * pool->num_passes;

   pool->num_counters = perf_info->counterIndexCount;
   pool->counters = malloc(pool->num_counters * sizeof(struct radv_perfcounter_impl));
   if (!pool->counters) {
      free(pc_reg_offsets);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   for (unsigned i = 0; i < pool->num_counters; ++i) {
      pool->counters[i] = pdevice->perfcounters[perf_info->pCounterIndices[i]].impl;

      for (unsigned j = 0; j < ARRAY_SIZE(pool->counters[i].regs); ++j) {
         uint32_t reg = pool->counters[i].regs[j];
         if (!reg || G_REG_CONSTANT(reg))
            continue;

         unsigned k;
         for (k = 0; k < pool->num_pc_regs; ++k)
            if (pool->pc_regs[k] == reg)
               break;
         pool->counters[i].regs[j] = pc_reg_offsets[k];
      }
   }

   free(pc_reg_offsets);
   return VK_SUCCESS;
}

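/* Point GRBM_GFX_INDEX at a single shader engine / block instance so the following counter
 * register accesses only target that instance; -1 selects broadcast mode for the respective
 * field.
 */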
static void
radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
}

static void
radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
                 unsigned *selectors)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   for (idx = 0; idx < count; ++idx) {
      radeon_set_perfctr_reg(cmd_buffer, regs->select0[idx],
                             G_REG_SEL(selectors[idx]) | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1);
      radeon_emit(cs, 0);
   }
}

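/* Read `count` counters of the currently selected block instance back to memory using COPY_DATA
 * from the perf register space. The destination slots of consecutive counters are
 * 2 * sizeof(uint64_t) * num_instances apart so that all instances of one counter end up
 * adjacent in the query buffer.
 */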
static void
radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block,
                                 unsigned count, uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   assert(regs->select0);
   for (unsigned idx = 0; idx < count; ++idx) {
      if (regs->counters)
         reg = regs->counters[idx];

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                         COPY_DATA_WR_CONFIRM | COPY_DATA_COUNT_SEL); /* 64 bits */
      radeon_emit(cs, reg >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      va += sizeof(uint64_t) * 2 *
            radv_pc_get_num_instances(cmd_buffer->device->physical_device, block);
      reg += reg_delta;
   }
}

static void
radv_pc_sample_block(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
                     uint64_t va)
{
   unsigned se_end = 1;
   if (block->b->b->flags & AC_PC_BLOCK_SE)
      se_end = cmd_buffer->device->physical_device->rad_info.max_se;

   for (unsigned se = 0; se < se_end; ++se) {
      for (unsigned instance = 0; instance < block->num_instances; ++instance) {
         radv_emit_instance(cmd_buffer, se, instance);
         radv_pc_emit_block_instance_read(cmd_buffer, block, count, va);
         va += sizeof(uint64_t) * 2;
      }
   }
}

static void
radv_pc_wait_idle(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

   radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
   radeon_emit(cs, 0);          /* CP_COHER_CNTL */
   radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
   radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
   radeon_emit(cs, 0);          /* CP_COHER_BASE */
   radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
   radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
   radeon_emit(cs, 0);          /* GCR_CNTL */

   radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   radeon_emit(cs, 0);
}

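/* Sample every configured counter into the begin (end == false) or end (end == true) slots of
 * the query buffer. Each pass is wrapped in a COND_EXEC on a per-pass predicate in the device's
 * perf counter BO, so when the command buffer is replayed once per pass (the pass index is
 * selected at submission time), only the registers that belong to the current pass are sampled.
 * The end variant also writes a per-pass availability marker.
 */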
static void
radv_pc_stop_and_sample(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                        uint64_t va, bool end)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   radv_pc_wait_idle(cmd_buffer);

   radv_emit_instance(cmd_buffer, -1, -1);
   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
                             S_036020_PERFMON_SAMPLE_ENABLE(1));

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) +
                         PERF_CTR_BO_PASS_OFFSET + 8 * pass;
      uint64_t reg_va = va + (end ? 8 : 0);

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->b->b->num_counters * pass;
         unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_pc_sample_block(cmd_buffer, ac_block, pass_reg_cnt,
                                 reg_va + offset * num_instances * sizeof(uint64_t) * 2);
         }

         i += cnt;
         reg_va += num_instances * sizeof(uint64_t) * 2 * cnt;
      }

      if (end) {
         uint64_t signal_va = va + pool->b.stride - 8 - 8 * pass;
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
         radeon_emit(cs,
                     S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
         radeon_emit(cs, signal_va);
         radeon_emit(cs, signal_va >> 32);
         radeon_emit(cs, 1); /* value */
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);
}

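/* Begin a performance query: reset the perfmon state, inhibit clock gating and force a stable
 * SPI configuration, program the counter selectors of the current pass under COND_EXEC, take the
 * "begin" samples and finally start the counters.
 */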
void
radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                    uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
   ASSERTED unsigned cdw_max;

   cmd_buffer->state.uses_perf_counters = true;

   cdw_max = radeon_check_space(cmd_buffer->device->ws, cs,
                                256 + /* Fixed overhead that doesn't scale with passes/counters */
                                10 * pool->num_passes + /* COND_EXECs */
                                pool->b.stride / 8 * (5 + 8));

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va =
      radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
   radeon_emit(cs, perf_ctr_va);
   radeon_emit(cs, perf_ctr_va >> 32);
   radeon_emit(cs, 0); /* value */

   radv_pc_wait_idle(cmd_buffer);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));

   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, true);
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, true);
   radv_perfcounter_emit_shaders(cs, 0x7f);

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) +
                         PERF_CTR_BO_PASS_OFFSET + 8 * pass;

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->b->b->num_counters * pass;

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_emit_select(cmd_buffer, ac_block, pass_reg_cnt, pool->pc_regs + i + offset);
         }

         i += cnt;
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);

   /* The following sequence actually starts the perfcounters. */

   radv_pc_stop_and_sample(cmd_buffer, pool, va, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, true);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

void
radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   ASSERTED unsigned cdw_max;

   cdw_max =
      radeon_check_space(cmd_buffer->device->ws, cs,
                         256 + /* Reserved for things that don't scale with passes/counters */
                         5 * pool->num_passes + /* COND_EXECs */
                         pool->b.stride / 8 * 8);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va =
      radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                              radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0,
                              EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, perf_ctr_va, 1,
                              cmd_buffer->gfx9_fence_va);
   radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, perf_ctr_va, 1, 0xffffffff);

   radv_pc_wait_idle(cmd_buffer);
   radv_pc_stop_and_sample(cmd_buffer, pool, va, true);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, false);
   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, false);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

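/* Result resolution on the CPU: every hardware register occupies a (begin, end) pair of uint64_t
 * values per instance (see radv_pc_init_query_pool). SUM accumulates end - begin over all
 * instances, MAX takes the largest end value, and constant "registers" simply return their
 * literal value.
 */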
static uint64_t
radv_pc_sum_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result += data[offset + 2 * i + 1] - data[offset + 2 * i];
   }

   return result;
}

static uint64_t
radv_pc_max_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result = MAX2(result, data[offset + 2 * i + 1]);
   }

   return result;
}

static union VkPerformanceCounterResultKHR
radv_pc_get_result(const struct radv_perfcounter_impl *impl, const uint64_t *data)
{
   union VkPerformanceCounterResultKHR result;

   switch (impl->op) {
   case RADV_PC_OP_MAX:
      result.float64 = radv_pc_max_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_SUM:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_RATIO_DIVSCALE:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data) /
                       (double)radv_pc_sum_reg(impl->regs[1], data) /
                       radv_pc_sum_reg(impl->regs[2], data) * 100.0;
      break;
   case RADV_PC_OP_REVERSE_RATIO: {
      double tmp = radv_pc_sum_reg(impl->regs[1], data);
      result.float64 = (tmp - radv_pc_sum_reg(impl->regs[0], data)) / tmp * 100.0;
      break;
   }
   case RADV_PC_OP_SUM_WEIGHTED_4:
      result.float64 = 0.0;
      for (unsigned i = 0; i < 4; ++i)
         result.float64 +=
            radv_pc_sum_reg(impl->regs[2 * i], data) * radv_pc_sum_reg(impl->regs[2 * i + 1], data);
      break;
   default:
      unreachable("unhandled performance counter operation");
   }
   return result;
}

void
radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out)
{
   union VkPerformanceCounterResultKHR *pc_result = out;

   for (unsigned i = 0; i < pc_pool->num_counters; ++i) {
      pc_result[i] = radv_pc_get_result(pc_pool->counters + i, data);
   }
}

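/* Performance counters are only exposed on the general queue. Every counter is advertised with
 * FLOAT64 storage and command scope; the UUID is the string "RADV" followed by the counter's
 * radv_perfcounter_uuid value in the last four bytes.
 */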
VKAPI_ATTR VkResult VKAPI_CALL
radv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount,
   VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (vk_queue_to_radv(pdevice, queueFamilyIndex) != RADV_QUEUE_GENERAL) {
      *pCounterCount = 0;
      return VK_SUCCESS;
   }

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t counter_cnt = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   if (!pCounters && !pCounterDescriptions) {
      *pCounterCount = counter_cnt;
      return VK_SUCCESS;
   }

   VkResult result = counter_cnt > *pCounterCount ? VK_INCOMPLETE : VK_SUCCESS;
   counter_cnt = MIN2(counter_cnt, *pCounterCount);
   *pCounterCount = counter_cnt;

   for (uint32_t i = 0; i < counter_cnt; ++i) {
      if (pCounters) {
         pCounters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
         pCounters[i].unit = descs[i].unit;
         pCounters[i].scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
         pCounters[i].storage = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR;

         memset(&pCounters[i].uuid, 0, sizeof(pCounters[i].uuid));
         strcpy((char*)&pCounters[i].uuid, "RADV");

         const uint32_t uuid = descs[i].uuid;
         memcpy(&pCounters[i].uuid[12], &uuid, sizeof(uuid));
      }

      if (pCounterDescriptions) {
         pCounterDescriptions[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
         pCounterDescriptions[i].flags =
            VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR;
         strcpy(pCounterDescriptions[i].name, descs[i].name);
         strcpy(pCounterDescriptions[i].category, descs[i].category);
         strcpy(pCounterDescriptions[i].description, descs[i].description);
      }
   }
   return result;
}

VKAPI_ATTR void VKAPI_CALL
radv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   VkPhysicalDevice physicalDevice,
   const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, uint32_t *pNumPasses)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (pPerformanceQueryCreateInfo->counterIndexCount == 0) {
      *pNumPasses = 0;
      return;
   }

   if (!radv_init_perfcounter_descs(pdevice)) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to init perf counters\n");
      *pNumPasses = 1;
      return;
   }

   assert(vk_queue_to_radv(pdevice, pPerformanceQueryCreateInfo->queueFamilyIndex) ==
          RADV_QUEUE_GENERAL);

   unsigned num_regs = 0;
   uint32_t *regs = NULL;
   VkResult result =
      radv_get_counter_registers(pdevice, pPerformanceQueryCreateInfo->counterIndexCount,
                                 pPerformanceQueryCreateInfo->pCounterIndices, &num_regs, &regs);
   if (result != VK_SUCCESS) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to allocate memory for perf counters\n");
   }

   *pNumPasses = radv_get_num_counter_passes(pdevice, num_regs, regs);
   free(regs);
}