| /* |
| * Copyright 2024 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "perf/xe/intel_perf.h" |
| |
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
| |
| #include "perf/intel_perf.h" |
| #include "perf/intel_perf_common.h" |
| #include "intel/common/intel_gem.h" |
| #include "intel/common/xe/intel_device_query.h" |
| #include "intel/common/xe/intel_queue.h" |
| |
| #include "drm-uapi/xe_drm.h" |
| |
| #define FIELD_PREP_ULL(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask)) |
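/* Shifts a value into the field selected by a mask,
 * e.g. FIELD_PREP_ULL(0xff00, 0x12) == 0x1200.
 */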
| |
| /* |
| * EU stall data format for Xe2 arch GPUs (LNL, BMG). |
| */ |
| struct xe_eu_stall_data_xe2 { |
| uint64_t ip_addr:29; /* Bits 0 to 28 */ |
| uint64_t tdr_count:8; /* Bits 29 to 36 */ |
| uint64_t other_count:8; /* Bits 37 to 44 */ |
| uint64_t control_count:8; /* Bits 45 to 52 */ |
| uint64_t pipestall_count:8; /* Bits 53 to 60 */ |
| uint64_t send_count:8; /* Bits 61 to 68 */ |
| uint64_t dist_acc_count:8; /* Bits 69 to 76 */ |
| uint64_t sbid_count:8; /* Bits 77 to 84 */ |
| uint64_t sync_count:8; /* Bits 85 to 92 */ |
| uint64_t inst_fetch_count:8; /* Bits 93 to 100 */ |
| uint64_t active_count:8; /* Bits 101 to 108 */ |
| uint64_t ex_id:3; /* Bits 109 to 111 */ |
| uint64_t end_flag:1; /* Bit 112 */ |
| uint64_t unused_bits:15; |
| uint64_t unused[6]; |
| } __packed; |
| |
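/* Build the OA report format descriptor passed to the KMD through
 * DRM_XE_OA_PROPERTY_OA_FORMAT: format type, counter select, counter size
 * and bc_report packed into the DRM_XE_OA_FORMAT_MASK_* fields.
 */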
| uint64_t xe_perf_get_oa_format(struct intel_perf_config *perf) |
| { |
| uint64_t fmt; |
| |
| if (perf->devinfo->verx10 >= 200) { |
| /* BSpec: 60942 |
| * PEC64u64 |
| */ |
| fmt = FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_FMT_TYPE, DRM_XE_OA_FMT_TYPE_PEC); |
| fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_COUNTER_SEL, 1); |
| fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_COUNTER_SIZE, 1); |
| fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_BC_REPORT, 0); |
| } else { |
      /* BSpec: 52198
       *
       * Same layout as I915_OA_FORMAT_A24u40_A14u32_B8_C8 (gfx 125+) and
       * I915_OA_FORMAT_A32u40_A4u32_B8_C8 (gfx 120).
       */
| fmt = FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_FMT_TYPE, DRM_XE_OA_FMT_TYPE_OAG); |
| fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_COUNTER_SEL, 5); |
| fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_COUNTER_SIZE, 0); |
| fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_BC_REPORT, 0); |
| } |
| |
| return fmt; |
| } |
| |
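/* Check whether the observation (OA) interface is usable by this process:
 * either observation_paranoid is 0 or we are running as root. While here,
 * query the OA units to see whether the unit driving the render engine
 * advertises sync support (DRM_XE_OA_CAPS_SYNCS).
 */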
| bool |
| xe_oa_metrics_available(struct intel_perf_config *perf, int fd, bool use_register_snapshots) |
| { |
| struct drm_xe_query_oa_units *oa_units; |
| bool perf_oa_available = false; |
| struct stat sb; |
| |
   /* The existence of this file implies that this Xe KMD version supports the
    * observation interface.
    */
| if (stat("/proc/sys/dev/xe/observation_paranoid", &sb) == 0) { |
| uint64_t paranoid = 1; |
| |
      /* Now check whether the application has privileges to access the
       * observation interface.
       *
       * TODO: this approach does not take into account applications running
       * with CAP_PERFMON privileges.
       */
| read_file_uint64("/proc/sys/dev/xe/observation_paranoid", ¶noid); |
| if (paranoid == 0 || geteuid() == 0) |
| perf_oa_available = true; |
| } |
| |
| if (!perf_oa_available) |
| return perf_oa_available; |
| |
| perf->features_supported |= INTEL_PERF_FEATURE_HOLD_PREEMPTION; |
| |
| oa_units = xe_device_query_alloc_fetch(fd, DRM_XE_DEVICE_QUERY_OA_UNITS, NULL); |
| if (oa_units) { |
| uint8_t *poau; |
| uint32_t i; |
| |
| poau = (uint8_t *)oa_units->oa_units; |
| for (i = 0; i < oa_units->num_oa_units; i++) { |
| struct drm_xe_oa_unit *oa_unit = (struct drm_xe_oa_unit *)poau; |
| uint32_t engine_i; |
| bool render_found = false; |
| |
| for (engine_i = 0; engine_i < oa_unit->num_engines; engine_i++) { |
| if (oa_unit->eci[engine_i].engine_class == DRM_XE_ENGINE_CLASS_RENDER) { |
| render_found = true; |
| break; |
| } |
| } |
| |
| if (!render_found) |
| continue; |
| |
| if (oa_unit->capabilities & DRM_XE_OA_CAPS_SYNCS) { |
| perf->features_supported |= INTEL_PERF_FEATURE_METRIC_SYNC; |
| break; |
| } |
| poau += sizeof(*oa_unit) + oa_unit->num_engines * sizeof(oa_unit->eci[0]); |
| } |
| |
| free(oa_units); |
| } |
| |
| return perf_oa_available; |
| } |
| |
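/* Register an OA metric configuration with the Xe KMD. Each register is
 * programmed as an (address, value) pair of two u32s, hence one uint64_t
 * slot per entry. Returns the config id reported by the KMD, or 0 on
 * failure.
 */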
| uint64_t |
| xe_add_config(struct intel_perf_config *perf, int fd, |
| const struct intel_perf_registers *config, |
| const char *guid) |
| { |
| struct drm_xe_oa_config xe_config = {}; |
| struct drm_xe_observation_param observation_param = { |
| .observation_type = DRM_XE_OBSERVATION_TYPE_OA, |
| .observation_op = DRM_XE_OBSERVATION_OP_ADD_CONFIG, |
| .param = (uintptr_t)&xe_config, |
| }; |
| uint32_t *regs; |
| int ret; |
| |
| memcpy(xe_config.uuid, guid, sizeof(xe_config.uuid)); |
| |
| xe_config.n_regs = config->n_mux_regs + config->n_b_counter_regs + config->n_flex_regs; |
| assert(xe_config.n_regs > 0); |
| |
| regs = malloc(sizeof(uint64_t) * xe_config.n_regs); |
| xe_config.regs_ptr = (uintptr_t)regs; |
| |
| memcpy(regs, config->mux_regs, config->n_mux_regs * sizeof(uint64_t)); |
| regs += 2 * config->n_mux_regs; |
| memcpy(regs, config->b_counter_regs, config->n_b_counter_regs * sizeof(uint64_t)); |
| regs += 2 * config->n_b_counter_regs; |
| memcpy(regs, config->flex_regs, config->n_flex_regs * sizeof(uint64_t)); |
| |
| ret = intel_ioctl(fd, DRM_IOCTL_XE_OBSERVATION, &observation_param); |
| free((void*)(uintptr_t)xe_config.regs_ptr); |
| return ret > 0 ? ret : 0; |
| } |
| |
| void |
| xe_remove_config(struct intel_perf_config *perf, int fd, uint64_t config_id) |
| { |
| struct drm_xe_observation_param observation_param = { |
| .observation_type = DRM_XE_OBSERVATION_TYPE_OA, |
| .observation_op = DRM_XE_OBSERVATION_OP_REMOVE_CONFIG, |
| .param = (uintptr_t)&config_id, |
| }; |
| |
| intel_ioctl(fd, DRM_IOCTL_XE_OBSERVATION, &observation_param); |
| } |
| |
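/* Append a set-property extension to the props array and chain it to the
 * previous entry so the KMD can walk the extensions as a linked list.
 */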
| static void |
| oa_prop_set(struct drm_xe_ext_set_property *props, uint32_t *index, |
| enum drm_xe_oa_property_id prop_id, uint64_t value) |
| { |
| if (*index > 0) |
| props[*index - 1].base.next_extension = (uintptr_t)&props[*index]; |
| |
| props[*index].base.name = DRM_XE_OA_EXTENSION_SET_PROPERTY; |
| props[*index].property = prop_id; |
| props[*index].value = value; |
| *index = *index + 1; |
| } |
| |
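/* Open an OA stream with the given metric set, report format and periodic
 * sampling exponent. When a bind timeline is provided, a timeline syncobj
 * point is attached so the open is ordered against outstanding binds.
 * Returns the stream fd on success, a negative value otherwise.
 */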
| int |
| xe_perf_stream_open(struct intel_perf_config *perf_config, int drm_fd, |
| uint32_t exec_id, uint64_t metrics_set_id, |
| uint64_t report_format, uint64_t period_exponent, |
| bool hold_preemption, bool enable, |
| struct intel_bind_timeline *timeline) |
| { |
| struct drm_xe_ext_set_property props[DRM_XE_OA_PROPERTY_NO_PREEMPT + 1] = {}; |
| struct drm_xe_observation_param observation_param = { |
| .observation_type = DRM_XE_OBSERVATION_TYPE_OA, |
| .observation_op = DRM_XE_OBSERVATION_OP_STREAM_OPEN, |
| .param = (uintptr_t)&props, |
| }; |
| struct drm_xe_sync sync = { |
| .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ, |
| .flags = DRM_XE_SYNC_FLAG_SIGNAL, |
| }; |
| uint32_t i = 0; |
| int fd, flags; |
| |
| if (exec_id) |
| oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_EXEC_QUEUE_ID, exec_id); |
| oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_OA_DISABLED, !enable); |
| oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_SAMPLE_OA, true); |
| oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_OA_METRIC_SET, metrics_set_id); |
| oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_OA_FORMAT, report_format); |
| oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_OA_PERIOD_EXPONENT, period_exponent); |
| if (hold_preemption) |
| oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_NO_PREEMPT, hold_preemption); |
| |
| if (timeline && intel_bind_timeline_get_syncobj(timeline)) { |
| oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_NUM_SYNCS, 1); |
| oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_SYNCS, (uintptr_t)&sync); |
| |
| sync.handle = intel_bind_timeline_get_syncobj(timeline); |
| sync.timeline_value = intel_bind_timeline_bind_begin(timeline); |
| fd = intel_ioctl(drm_fd, DRM_IOCTL_XE_OBSERVATION, &observation_param); |
| intel_bind_timeline_bind_end(timeline); |
| } else { |
| fd = intel_ioctl(drm_fd, DRM_IOCTL_XE_OBSERVATION, &observation_param); |
| } |
| |
| if (fd < 0) |
| return fd; |
| |
   /* O_CLOEXEC is a file descriptor flag, not a file status flag, so it must
    * be set with F_SETFD rather than F_SETFL.
    */
   flags = fcntl(fd, F_GETFL, 0);
   if (fcntl(fd, F_SETFD, FD_CLOEXEC) ||
       fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
      close(fd);
      return -1;
   }
| |
| return fd; |
| } |
| |
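/* Enable or disable an already opened OA stream. */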
| int |
| xe_perf_stream_set_state(int perf_stream_fd, bool enable) |
| { |
| unsigned long uapi = enable ? DRM_XE_OBSERVATION_IOCTL_ENABLE : |
| DRM_XE_OBSERVATION_IOCTL_DISABLE; |
| |
| return intel_ioctl(perf_stream_fd, uapi, 0); |
| } |
| |
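/* Switch the metric set of an open OA stream. When a bind timeline is
 * provided, three syncs are attached: wait for the exec queue to go idle,
 * wait for the previous reconfiguration and signal a new timeline point once
 * this one completes.
 */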
| int |
| xe_perf_stream_set_metrics_id(int perf_stream_fd, int drm_fd, |
| uint32_t exec_queue, uint64_t metrics_set_id, |
| struct intel_bind_timeline *timeline) |
| { |
| struct drm_xe_ext_set_property prop[3] = {}; |
| uint32_t index = 0; |
| int ret; |
| |
| oa_prop_set(prop, &index, DRM_XE_OA_PROPERTY_OA_METRIC_SET, |
| metrics_set_id); |
| |
| if (timeline && intel_bind_timeline_get_syncobj(timeline)) { |
| struct drm_xe_sync xe_syncs[3] = {}; |
| uint32_t syncobj; |
      ASSERTED int ret2;
| |
| oa_prop_set(prop, &index, DRM_XE_OA_PROPERTY_NUM_SYNCS, ARRAY_SIZE(xe_syncs)); |
| oa_prop_set(prop, &index, DRM_XE_OA_PROPERTY_SYNCS, (uintptr_t)xe_syncs); |
| |
      /* Wait for all previously submitted execs in the queue to complete */
| ret = xe_queue_get_syncobj_for_idle(drm_fd, exec_queue, &syncobj); |
| if (ret) |
| return ret; |
| xe_syncs[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ; |
| xe_syncs[0].flags = 0;/* wait */ |
| xe_syncs[0].handle = syncobj; |
| |
| /* wait on previous set_metrics_id to complete */ |
| xe_syncs[1].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ; |
| xe_syncs[1].flags = 0;/* wait */ |
| xe_syncs[1].handle = intel_bind_timeline_get_syncobj(timeline); |
| xe_syncs[1].timeline_value = intel_bind_timeline_get_last_point(timeline); |
| |
| /* signal completion */ |
| xe_syncs[2].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ; |
| xe_syncs[2].flags = DRM_XE_SYNC_FLAG_SIGNAL; |
| xe_syncs[2].handle = intel_bind_timeline_get_syncobj(timeline); |
| xe_syncs[2].timeline_value = intel_bind_timeline_bind_begin(timeline); |
| |
| ret = intel_ioctl(perf_stream_fd, DRM_XE_OBSERVATION_IOCTL_CONFIG, |
| (void *)(uintptr_t)&prop); |
| intel_bind_timeline_bind_end(timeline); |
| |
      /* Safe to destroy the syncobj here: the Xe KMD is expected to hold its
       * own reference for as long as it is using it.
       */
| struct drm_syncobj_destroy syncobj_destroy = { |
| .handle = syncobj, |
| }; |
| ret2 = intel_ioctl(drm_fd, DRM_IOCTL_SYNCOBJ_DESTROY, &syncobj_destroy); |
| assert(ret2 == 0); |
| } else { |
| ret = intel_ioctl(perf_stream_fd, DRM_XE_OBSERVATION_IOCTL_CONFIG, |
| (void *)(uintptr_t)&prop); |
| } |
| |
| return ret; |
| } |
| |
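/* A read() failing with EIO carries a status condition: query
 * DRM_XE_OBSERVATION_IOCTL_STATUS and translate the oa_status bits into a
 * header-only intel_perf record for the caller.
 */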
| static int |
| xe_perf_stream_read_error(int perf_stream_fd, uint8_t *buffer) |
| { |
| struct drm_xe_oa_stream_status status = {}; |
| struct intel_perf_record_header *header; |
| int ret; |
| |
| ret = intel_ioctl(perf_stream_fd, DRM_XE_OBSERVATION_IOCTL_STATUS, &status); |
| if (ret) |
| return -errno; |
| |
| header = (struct intel_perf_record_header *)buffer; |
| header->pad = 0; |
| header->type = 0; |
| header->size = sizeof(*header); |
| ret = header->size; |
| |
   if (status.oa_status & DRM_XE_OASTATUS_BUFFER_OVERFLOW)
| header->type = INTEL_PERF_RECORD_TYPE_OA_BUFFER_LOST; |
| else if (status.oa_status & DRM_XE_OASTATUS_REPORT_LOST) |
| header->type = INTEL_PERF_RECORD_TYPE_OA_REPORT_LOST; |
| else if (status.oa_status & DRM_XE_OASTATUS_COUNTER_OVERFLOW) |
| header->type = INTEL_PERF_RECORD_TYPE_COUNTER_OVERFLOW; |
| else if (status.oa_status & DRM_XE_OASTATUS_MMIO_TRG_Q_FULL) |
| header->type = INTEL_PERF_RECORD_TYPE_MMIO_TRG_Q_FULL; |
| else |
| unreachable("missing"); |
| |
| return header->type ? header->size : -1; |
| } |
| |
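/* Read raw OA reports and repack them in place as
 * (intel_perf_record_header + sample) records. The read is capped so that
 * the expanded records still fit in the caller's buffer: raw data is first
 * moved to the end of the buffer, then copied forward sample by sample
 * behind a freshly written header.
 */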
| int |
| xe_perf_stream_read_samples(struct intel_perf_config *perf_config, int perf_stream_fd, |
| uint8_t *buffer, size_t buffer_len) |
| { |
| const size_t sample_size = perf_config->oa_sample_size; |
| const size_t sample_header_size = sample_size + sizeof(struct intel_perf_record_header); |
| uint32_t num_samples = buffer_len / sample_header_size; |
| const size_t max_bytes_read = num_samples * sample_size; |
| uint8_t *offset, *offset_samples; |
| int len, i; |
| |
| if (buffer_len < sample_header_size) |
| return -ENOSPC; |
| |
| do { |
| len = read(perf_stream_fd, buffer, max_bytes_read); |
| } while (len < 0 && errno == EINTR); |
| |
| if (len <= 0) { |
| if (errno == EIO) |
| return xe_perf_stream_read_error(perf_stream_fd, buffer); |
| |
| return len < 0 ? -errno : 0; |
| } |
| |
| num_samples = len / sample_size; |
| offset = buffer; |
| offset_samples = buffer + (buffer_len - len); |
   /* Move all raw samples to the end of the buffer. */
| memmove(offset_samples, buffer, len); |
| |
   /* Set up each record header, then copy the sample back from the end of
    * the buffer.
    */
| for (i = 0; i < num_samples; i++) { |
| struct intel_perf_record_header *header = (struct intel_perf_record_header *)offset; |
| |
| /* TODO: also append REPORT_LOST and BUFFER_LOST */ |
| header->type = INTEL_PERF_RECORD_TYPE_SAMPLE; |
| header->pad = 0; |
| header->size = sample_header_size; |
| offset += sizeof(*header); |
| |
| memmove(offset, offset_samples, sample_size); |
| offset += sample_size; |
| offset_samples += sample_size; |
| } |
| |
| return offset - buffer; |
| } |
| |
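/* Return the GT id of the first render engine, or -1 if none is found. */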
static int
first_rendering_gt_id(int drm_fd)
{
   struct intel_query_engine_info *engine_info =
      intel_engine_get_info(drm_fd, INTEL_KMD_TYPE_XE);
   int gt_id = -1;

   if (!engine_info)
      return -1;

   for (int i = 0; i < engine_info->num_engines; i++) {
      if (engine_info->engines[i].engine_class == INTEL_ENGINE_CLASS_RENDER) {
         gt_id = engine_info->engines[i].gt_id;
         break;
      }
   }

   free(engine_info);
   return gt_id;
}
| |
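/* Open an EU stall sampling stream on the GT hosting the first render
 * engine. Returns the stream fd on success, a negative value otherwise.
 */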
| int |
| xe_perf_eustall_stream_open(int drm_fd, uint32_t sample_rate, |
| uint32_t min_event_count) |
| { |
| struct drm_xe_ext_set_property props[DRM_XE_EU_STALL_PROP_WAIT_NUM_REPORTS + 1] = {}; |
| struct drm_xe_observation_param observation_param = { |
| .observation_type = DRM_XE_OBSERVATION_TYPE_EU_STALL, |
| .observation_op = DRM_XE_OBSERVATION_OP_STREAM_OPEN, |
| .param = (uintptr_t)&props, |
| }; |
| uint32_t i = 0; |
| int fd, flags; |
| int gt_id = first_rendering_gt_id(drm_fd); |
| assert(gt_id >= 0); |
| |
| oa_prop_set(props, &i, DRM_XE_EU_STALL_PROP_SAMPLE_RATE, sample_rate); |
| oa_prop_set(props, &i, DRM_XE_EU_STALL_PROP_WAIT_NUM_REPORTS, min_event_count); |
| oa_prop_set(props, &i, DRM_XE_EU_STALL_PROP_GT_ID, gt_id); |
| |
| fd = intel_ioctl(drm_fd, DRM_IOCTL_XE_OBSERVATION, &observation_param); |
| if (fd < 0) |
| return -errno; |
| |
   /* As above, FD_CLOEXEC has to be set with F_SETFD, O_NONBLOCK with
    * F_SETFL.
    */
   flags = fcntl(fd, F_GETFL, 0);
   if (fcntl(fd, F_SETFD, FD_CLOEXEC) ||
       fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
      close(fd);
      return -1;
   }
| |
| return fd; |
| } |
| |
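/* Size in bytes of a single EU stall record, as reported by the KMD. */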
| int |
| xe_perf_eustall_stream_record_size(int drm_fd) |
| { |
| int record_size; |
| struct drm_xe_query_eu_stall *eu_stall_data = |
| xe_device_query_alloc_fetch(drm_fd, DRM_XE_DEVICE_QUERY_EU_STALL, NULL); |
| if (!eu_stall_data) |
| return -errno; |
| |
| assert(eu_stall_data->record_size > 0 && |
| eu_stall_data->record_size < INT_MAX); |
| record_size = (int)eu_stall_data->record_size; |
| free(eu_stall_data); |
| return record_size; |
| } |
| |
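/* Return the EU stall sampling rate to request, picked from the rates
 * reported by the KMD.
 */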
| int |
| xe_perf_eustall_stream_sample_rate(int drm_fd) |
| { |
| struct drm_xe_query_eu_stall *eu_stall_data = |
| xe_device_query_alloc_fetch(drm_fd, DRM_XE_DEVICE_QUERY_EU_STALL, NULL); |
| if (!eu_stall_data) |
| return -errno; |
| |
   /* Pick the slowest rate to reduce the chance of overflowing the EU stall
    * buffer.
    */
   int idx_slowest = eu_stall_data->num_sampling_rates - 1;

   assert(idx_slowest >= 0);
   assert(eu_stall_data->sampling_rates[idx_slowest] > 0 &&
          eu_stall_data->sampling_rates[idx_slowest] < INT_MAX);
   int sampling_rate = (int)eu_stall_data->sampling_rates[idx_slowest];
| free(eu_stall_data); |
| return sampling_rate; |
| } |
| |
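/* Read raw EU stall records. An EIO from read() is treated as a buffer
 * overflow: it is reported through *overflow and the read is retried.
 */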
| int |
| xe_perf_eustall_stream_read_samples(int perf_stream_fd, uint8_t *buffer, |
| size_t buffer_len, bool *overflow) |
| { |
| int len; |
| |
| *overflow = false; |
| do { |
| len = read(perf_stream_fd, buffer, buffer_len); |
| if (unlikely(len < 0 && errno == EIO)) |
| *overflow = true; |
| } while (len < 0 && (errno == EINTR || errno == EIO)); |
| |
| if (unlikely(len < 0 && errno == EAGAIN)) |
| len = 0; |
| |
| return len < 0 ? -errno : len; |
| } |
| |
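/* Walk the raw EU stall records between start and end, interpreting them
 * with the Xe2 layout above, and accumulate the per-IP counters into the
 * result hash table keyed by instruction pointer.
 */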
| void |
| xe_perf_eustall_accumulate_results(struct intel_perf_query_eustall_result *result, |
| const uint8_t *start, const uint8_t *end, |
| size_t record_size) |
| { |
| const uint8_t *offset; |
| assert(((end - start) % record_size) == 0); |
| |
| for (offset = start; offset < end; offset += record_size) { |
| const struct xe_eu_stall_data_xe2* stall_data = |
| (const struct xe_eu_stall_data_xe2*)offset; |
| struct intel_perf_query_eustall_event* stall_result; |
| uint64_t ip_addr = stall_data->ip_addr; |
| struct hash_entry *e = _mesa_hash_table_search(result->accumulator, |
| (const void*)&ip_addr); |
| if (e) { |
| stall_result = e->data; |
| } else { |
| stall_result = calloc(1, sizeof(struct intel_perf_query_eustall_event)); |
| stall_result->ip_addr = ip_addr; |
| _mesa_hash_table_insert(result->accumulator, |
| (const void*)&stall_result->ip_addr, |
| stall_result); |
| } |
| assert(stall_result->ip_addr == stall_data->ip_addr); |
| |
| stall_result->tdr_count += stall_data->tdr_count; |
| stall_result->other_count += stall_data->other_count; |
| stall_result->control_count += stall_data->control_count; |
| stall_result->pipestall_count += stall_data->pipestall_count; |
| stall_result->send_count += stall_data->send_count; |
| stall_result->dist_acc_count += stall_data->dist_acc_count; |
| stall_result->sbid_count += stall_data->sbid_count; |
| stall_result->sync_count += stall_data->sync_count; |
| stall_result->inst_fetch_count += stall_data->inst_fetch_count; |
| stall_result->active_count += stall_data->active_count; |
| |
| result->records_accumulated++; |
| } |
| } |