/*
 * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "mali_profiler_magma.h"

#include "hwcpipe_log.h"

#include "magma.h"
#include "magma_arm_mali_types.h"
#include "magma_vendor_queries.h"
#include <algorithm>
#include <stdexcept>

#include <filesystem>
#include <lib/fdio/directory.h>
#include <lib/zx/channel.h>

using mali_userspace::MALI_NAME_BLOCK_JM;
using mali_userspace::MALI_NAME_BLOCK_MMU;
using mali_userspace::MALI_NAME_BLOCK_SHADER;
using mali_userspace::MALI_NAME_BLOCK_TILER;

namespace hwcpipe
{
namespace
{
struct MaliHWInfo
{
	unsigned mp_count;
	unsigned gpu_id;
	unsigned r_value;
	unsigned p_value;
	unsigned core_mask;
	unsigned l2_slices;
};

static uint32_t extract_bits(uint64_t input, uint32_t shift, uint32_t width)
{
	return (input >> shift) & ((1 << width) - 1);
}
}        // namespace

typedef std::function<uint64_t(void)> MaliValueGetter;

MaliProfilerMagma::MaliProfilerMagma(const GpuCounterSet &enabled_counters) :
    enabled_counters_(enabled_counters)
{
	// Throws if setup fails
	init();

	const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> valhall_mappings = {
	    {GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
	    {GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
	    {GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},
	    {GpuCounter::TilerCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TILER_ACTIVE"); }},

	    {GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
	    {GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
	    {GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},

	    {GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
	    {GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
	    {GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
	    {GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILL"); }},
	    {GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_TEST"); }},
	    {GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_KILL"); }},

	    {GpuCounter::Instructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_FMA") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_CVT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_SFU") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_MSG"); }},
	    {GpuCounter::DivergedInstructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_DIVERGED"); }},

	    {GpuCounter::ShaderCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_CORE_ACTIVE"); }},
	    // The three units run in parallel so we can approximate cycles by taking the largest value. SFU instructions use 4 cycles per warp.
	    {GpuCounter::ShaderArithmeticCycles, [this] { return std::max(get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_FMA"), std::max(get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_CVT"), 4 * get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_SFU"))); }},
	    {GpuCounter::ShaderLoadStoreCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_ATOMIC"); }},
	    {GpuCounter::ShaderTextureCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_FILT_NUM_OPERATIONS"); }},

	    {GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
	    {GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
	    {GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
	    {GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
	    {GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
	    {GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
	    {GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
	    {GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
	};

	const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> bifrost_mappings = {
	    {GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
	    {GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
	    {GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},
	    {GpuCounter::TilerCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TILER_ACTIVE"); }},

	    {GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
	    {GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
	    {GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},

	    {GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
	    {GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
	    {GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
	    {GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILL"); }},
	    {GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_TEST"); }},
	    {GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_KILL"); }},

	    {GpuCounter::Instructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_COUNT"); }},
	    {GpuCounter::DivergedInstructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_DIVERGED"); }},

	    {GpuCounter::ShaderCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_CORE_ACTIVE"); }},
	    {GpuCounter::ShaderArithmeticCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_COUNT"); }},
	    {GpuCounter::ShaderLoadStoreCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_ATOMIC"); }},
	    {GpuCounter::ShaderTextureCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_FILT_NUM_OPERATIONS"); }},

	    {GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
	    {GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
	    {GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
	    {GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
	    {GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
	    {GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
	    {GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
	    {GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
	};

	const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> midgard_mappings = {
	    {GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
	    {GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
	    {GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},

	    {GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
	    {GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
	    {GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},

	    {GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
	    {GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
	    {GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
	    {GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILLED"); }},
	    {GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_THREADS_LZS_TEST"); }},
	    {GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_THREADS_LZS_KILLED"); }},

	    {GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
	    {GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
	    {GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
	    {GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
	    {GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
	    {GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
	    {GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
	    {GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
	};

	auto product = std::find_if(std::begin(mali_userspace::products), std::end(mali_userspace::products), [&](const mali_userspace::CounterMapping &cm) {
		return (cm.product_mask & gpu_id_) == cm.product_id;
	});

	if (product != std::end(mali_userspace::products))
	{
		switch (product->product_id)
		{
			case mali_userspace::PRODUCT_ID_T60X:
			case mali_userspace::PRODUCT_ID_T62X:
			case mali_userspace::PRODUCT_ID_T72X:
				mappings_                     = midgard_mappings;
				mappings_[GpuCounter::Pixels] = [this]() { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 256; };
				break;
			case mali_userspace::PRODUCT_ID_T76X:
			case mali_userspace::PRODUCT_ID_T82X:
			case mali_userspace::PRODUCT_ID_T83X:
			case mali_userspace::PRODUCT_ID_T86X:
			case mali_userspace::PRODUCT_ID_TFRX:
				mappings_ = midgard_mappings;
				break;
			case mali_userspace::PRODUCT_ID_TMIX:
			case mali_userspace::PRODUCT_ID_THEX:
				mappings_                                  = bifrost_mappings;
				mappings_[GpuCounter::ShaderTextureCycles] = [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_COORD_ISSUE"); };
			case mali_userspace::PRODUCT_ID_TSIX:
			case mali_userspace::PRODUCT_ID_TNOX:
			case mali_userspace::PRODUCT_ID_TGOX:
			case mali_userspace::PRODUCT_ID_TDVX:
				mappings_ = bifrost_mappings;
			case mali_userspace::PRODUCT_ID_TNAXa:
			case mali_userspace::PRODUCT_ID_TNAXb:
			case mali_userspace::PRODUCT_ID_TTRX:
			default:
				mappings_ = valhall_mappings;
				break;
		}
	}
	else
	{
		HWCPIPE_LOG("Mali counters initialization failed: Failed to identify GPU");
	}
}

MaliProfilerMagma::~MaliProfilerMagma()
{
	if (buffer_)
		magma_release_buffer(connection_, buffer_);
	if (pool_)
		magma_connection_release_performance_counter_buffer_pool(connection_, pool_);
	if (connection_)
		magma_release_connection(connection_);
	if (device_)
		magma_device_release(device_);
}

void MaliProfilerMagma::init()
{
	MaliHWInfo hw_info;
	for (auto &p : std::filesystem::directory_iterator("/dev/class/gpu"))
	{
		zx::channel server_end, client_end;
		zx_status_t zx_status = zx::channel::create(0, &server_end, &client_end);
		if (zx_status != ZX_OK)
		{
			throw std::runtime_error("Failed to create zx channel");
		}
		zx_status = fdio_service_connect(p.path().c_str(), server_end.release());
		if (zx_status != ZX_OK)
		{
			throw std::runtime_error("Failed to connect to device");
		}

		magma_device_t device;
		magma_status_t status = magma_device_import(client_end.release(), &device);
		if (status != MAGMA_STATUS_OK)
		{
			throw std::runtime_error("Failed to find magma device.");
		}
		uint64_t vendor_id = 0;
		status             = magma_query2(device, MAGMA_QUERY_VENDOR_ID, &vendor_id);
		if (status != MAGMA_STATUS_OK)
		{
			throw std::runtime_error("Failed to query vendor id");
		}
		if (vendor_id != MAGMA_VENDOR_ID_MALI)
		{
			magma_device_release(device);
			continue;
		}
		device_ = device;
		break;
	}

	if (!device_)
	{
		throw std::runtime_error("Didn't find valid mali device.");
	}

	memset(&hw_info, 0, sizeof(hw_info));
	uint64_t       device_id = 0;
	magma_status_t status    = magma_query2(device_, MAGMA_QUERY_DEVICE_ID, &device_id);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Querying device ID failed.");
	}
	hw_info.gpu_id  = extract_bits(device_id, 16, 16);
	hw_info.r_value = extract_bits(device_id, 12, 4);
	hw_info.p_value = extract_bits(device_id, 4, 8);
	uint64_t shader_mask;
	status = magma_query2(device_, kMsdArmVendorQueryShaderPresent, &shader_mask);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Querying shader present failed.");
	}
	hw_info.core_mask = shader_mask;
	hw_info.mp_count  = __builtin_popcountll(hw_info.core_mask);
	uint64_t mem_features;
	status = magma_query2(device_, kMsdArmVendorQueryMemoryFeatures, &mem_features);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Querying memory features failed.");
	}
	hw_info.l2_slices = extract_bits(mem_features, 8, 5) + 1;

	num_cores_     = hw_info.mp_count;
	num_l2_slices_ = hw_info.l2_slices;
	gpu_id_        = hw_info.gpu_id;

	status = magma_create_connection2(device_, &connection_);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Creatng magma connection failed.");
	}

	bool success = false;
	for (auto &p : std::filesystem::directory_iterator("/dev/class/gpu-performance-counters"))
	{
		zx::channel server_end, client_end;
		zx::channel::create(0, &server_end, &client_end);

		zx_status_t zx_status = fdio_service_connect(p.path().c_str(), server_end.release());
		if (zx_status != ZX_OK)
		{
			throw std::runtime_error("Failed to connect to GPU perf count access service\n");
		}
		magma_status_t status =
		    magma_connection_access_performance_counters(connection_, client_end.release());
		if (status == MAGMA_STATUS_OK)
		{
			success = true;
		}
	}
	if (!success)
	{
		throw std::runtime_error("Failed to enable perf count access.");
	}

	size_t buffer_size;

	// At the moment we only ever should have 1 read outstanding, so only create one buffer.
	constexpr uint32_t kBufferSize = 4096;
	status                         = magma_create_buffer(connection_, kBufferSize, &buffer_size, &buffer_);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Create buffer failed.");
	}
	buffer_size_ = buffer_size;

	status = magma_connection_create_performance_counter_buffer_pool(connection_, &pool_, &notification_handle_);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Create performance counter buffer pool failed.");
	}
	magma_buffer_offset offset;
	offset.buffer_id = magma_get_buffer_id(buffer_);
	offset.offset    = 0;
	offset.length    = 4096;
	status           = magma_connection_add_performance_counter_buffer_offsets_to_pool(connection_, pool_, &offset, 1);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Add performance counters failed.");
	}

	uint64_t vector = 1;
	status          = magma_connection_enable_performance_counters(connection_, &vector, 1);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Enable performance counters failed.");
	}

	auto product = std::find_if(std::begin(mali_userspace::products), std::end(mali_userspace::products), [&](const mali_userspace::CounterMapping &cm) {
		return (cm.product_mask & hw_info.gpu_id) == cm.product_id;
	});

	if (product != std::end(mali_userspace::products))
	{
		names_lut_ = product->names_lut;
	}
	else
	{
		throw std::runtime_error("Could not identify GPU.");
	}

	raw_counter_buffer_.resize(buffer_size_ / sizeof(uint32_t));

	// Build core remap table.
	core_index_remap_.clear();
	core_index_remap_.reserve(hw_info.mp_count);

	unsigned int mask = hw_info.core_mask;

	while (mask != 0)
	{
		unsigned int bit = __builtin_ctz(mask);
		core_index_remap_.push_back(bit);
		mask &= ~(1u << bit);
	}
}

void MaliProfilerMagma::run()
{
	sample_counters();
	wait_next_event();
}

void MaliProfilerMagma::stop()
{
	// We don't need to do anything on stop()
}

const GpuMeasurements &MaliProfilerMagma::sample()
{
	sample_counters();
	wait_next_event();

	for (const auto &counter : enabled_counters_)
	{
		auto mapping = mappings_.find(counter);
		if (mapping == mappings_.end())
		{
			continue;
		}

		measurements_[mapping->first] = mapping->second();
	}

	return measurements_;
}

void MaliProfilerMagma::sample_counters()
{
	magma_status_t status = magma_connection_dump_performance_counters(connection_, pool_, 1);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Dump performance counters failed.");
	}
}

void MaliProfilerMagma::wait_next_event()
{
	magma_poll_item_t poll_item{};
	poll_item.type        = MAGMA_POLL_TYPE_HANDLE;
	poll_item.condition   = MAGMA_POLL_CONDITION_READABLE;
	poll_item.handle      = notification_handle_;
	magma_status_t status = magma_poll(&poll_item, 1, INT64_MAX);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Poll for performance counters failed.");
	}
	uint32_t trigger_id;
	uint64_t buffer_id;
	uint32_t buffer_offset;
	uint64_t time;
	uint32_t result_flags;
	status = magma_connection_read_performance_counter_completion(
	    connection_, pool_, &trigger_id, &buffer_id, &buffer_offset,
	    &time, &result_flags);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Read performance counters failed.");
	}
	void *data;
	status = magma_map(connection_, buffer_, &data);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Mapping performance counters failed.");
	}
	memcpy(raw_counter_buffer_.data(), data, 4096);
	timestamp_ = time;
	magma_buffer_offset offset;
	offset.buffer_id = magma_get_buffer_id(buffer_);
	offset.offset    = 0;
	offset.length    = 4096;
	status           = magma_connection_add_performance_counter_buffer_offsets_to_pool(connection_, pool_, &offset, 1);
	if (status != MAGMA_STATUS_OK)
	{
		throw std::runtime_error("Add performance counters failed.");
	}
}

uint64_t MaliProfilerMagma::get_counter_value(mali_userspace::MaliCounterBlockName block, const char *name) const
{
	uint64_t sum = 0;
	switch (block)
	{
		case mali_userspace::MALI_NAME_BLOCK_MMU:
			// If an MMU counter is selected, sum the values over MMU slices
			for (int i = 0; i < num_l2_slices_; i++)
			{
				sum += get_counters(block, i)[find_counter_index_by_name(block, name)];
			}
			return sum;

		case mali_userspace::MALI_NAME_BLOCK_SHADER:
			// If a shader core counter is selected, sum the values over shader cores
			for (int i = 0; i < num_cores_; i++)
			{
				sum += get_counters(block, i)[find_counter_index_by_name(block, name)];
			}
			return sum;

		case mali_userspace::MALI_NAME_BLOCK_JM:
		case mali_userspace::MALI_NAME_BLOCK_TILER:
		default:
			return static_cast<uint64_t>(get_counters(block)[find_counter_index_by_name(block, name)]);
	}
}

const uint32_t *MaliProfilerMagma::get_counters(mali_userspace::MaliCounterBlockName block, int index) const
{
	switch (block)
	{
		case mali_userspace::MALI_NAME_BLOCK_JM:
			return raw_counter_buffer_.data() + mali_userspace::MALI_NAME_BLOCK_SIZE * 0;
		case mali_userspace::MALI_NAME_BLOCK_MMU:
			if (index < 0 || index >= num_l2_slices_)
			{
				throw std::runtime_error("Invalid slice number.");
			}

			// If an MMU counter is selected, index refers to the MMU slice
			return raw_counter_buffer_.data() + mali_userspace::MALI_NAME_BLOCK_SIZE * (2 + index);
		case mali_userspace::MALI_NAME_BLOCK_TILER:
			return raw_counter_buffer_.data() + mali_userspace::MALI_NAME_BLOCK_SIZE * 1;
		default:
			if (index < 0 || index >= num_cores_)
			{
				throw std::runtime_error("Invalid core number.");
			}

			// If a shader core counter is selected, index refers to the core index
			return raw_counter_buffer_.data() + mali_userspace::MALI_NAME_BLOCK_SIZE * (2 + num_l2_slices_ + core_index_remap_[index]);
	}
}

int MaliProfilerMagma::find_counter_index_by_name(mali_userspace::MaliCounterBlockName block, const char *name) const
{
	const char *const *names = &names_lut_[mali_userspace::MALI_NAME_BLOCK_SIZE * block];

	for (int i = 0; i < mali_userspace::MALI_NAME_BLOCK_SIZE; ++i)
	{
		if (strstr(names[i], name) != nullptr)
		{
			return i;
		}
	}

	return -1;
}

}        // namespace hwcpipe
