Add Magma support

Copy mali_profiler.h/cpp into new files and convert them to use the
magma performance counter API.

Change-Id: If8b8556468d063e89a1fa00d79c94f49ddd899ab
Reviewed-on: https://fuchsia-review.googlesource.com/c/third_party/github.com/ARM-software/HWCPipe/+/413156
Reviewed-by: Craig Stout <cstout@google.com>
diff --git a/BUILD.gn b/BUILD.gn
new file mode 100644
index 0000000..8a099cf
--- /dev/null
+++ b/BUILD.gn
@@ -0,0 +1,34 @@
+# Copyright 2020 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+config("hwcpipe_config") {
+  include_dirs = [ "." ]  # Lets code using this config include headers relative to this directory.
+  defines = [ "HWCPIPE_NO_JSON" ]  # hwcpipe.cpp only includes json.hpp when this is not defined.
+}
+
+source_set("mali_profiler") {  # HWCPipe core plus the magma-based Mali GPU profiler.
+  sources = [
+    "cpu_profiler.h",
+    "gpu_profiler.h",
+    "hwcpipe.cpp",
+    "hwcpipe.h",
+    "vendor/arm/mali/hwc.hpp",
+    "vendor/arm/mali/hwc_names.hpp",
+    "vendor/arm/mali/mali_profiler_magma.cpp",
+    "vendor/arm/mali/mali_profiler_magma.h",
+  ]
+  public_configs = [ ":hwcpipe_config" ]
+  configs -= [ "//build/config:no_exceptions" ]  # These sources throw and catch std::runtime_error.
+
+  # Disable ShadowCallStack, since there seem to be some issues with the call stack after exceptions
+  # are caught.
+  # TODO(fxb/41627): Re-enable.
+  cflags = [ "-fno-sanitize=shadow-call-stack" ]
+  deps = [
+    "//sdk/lib/fdio",
+    "//src/graphics/drivers/msd-arm-mali/include",
+    "//src/graphics/lib/magma/src/libmagma",
+    "//zircon/public/lib/zx",
+  ]
+}
diff --git a/hwcpipe.cpp b/hwcpipe.cpp
index 2d3ba19..5e22372 100644
--- a/hwcpipe.cpp
+++ b/hwcpipe.cpp
@@ -30,8 +30,12 @@
 #	include "vendor/arm/mali/mali_profiler.h"
 #endif
 
+#ifdef __Fuchsia__
+#	include "vendor/arm/mali/mali_profiler_magma.h"
+#endif
+
 #ifndef HWCPIPE_NO_JSON
-#include <json.hpp>
+#	include <json.hpp>
 using json = nlohmann::json;
 #endif
 
@@ -189,6 +193,15 @@
 	{
 		HWCPIPE_LOG("Mali profiler initialization failed: %s", e.what());
 	}
+#elif defined(__Fuchsia__)
+	try
+	{
+		gpu_profiler_ = std::unique_ptr<MaliProfilerMagma>(new MaliProfilerMagma(enabled_gpu_counters));
+	}
+	catch (const std::runtime_error &e)
+	{
+		HWCPIPE_LOG("Mali profiler initialization failed: %s", e.what());
+	}
 #else
 	HWCPIPE_LOG("No counters available for this platform.");
 #endif
diff --git a/vendor/arm/mali/hwc_names.hpp b/vendor/arm/mali/hwc_names.hpp
index 84a6e3f..348ecd8 100644
--- a/vendor/arm/mali/hwc_names.hpp
+++ b/vendor/arm/mali/hwc_names.hpp
@@ -4411,6 +4411,11 @@
             PRODUCT_ID_TTRX,
             hardware_counters_mali_tTRx,
         },
+        {
+            PRODUCT_ID_MASK_NEW,
+            PRODUCT_ID_TGOX,
+            hardware_counters_mali_tGOx,
+        },
     };
 
 enum
diff --git a/vendor/arm/mali/mali_profiler_magma.cpp b/vendor/arm/mali/mali_profiler_magma.cpp
new file mode 100644
index 0000000..4839e64
--- /dev/null
+++ b/vendor/arm/mali/mali_profiler_magma.cpp
@@ -0,0 +1,539 @@
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "mali_profiler_magma.h"
+
+#include "hwcpipe_log.h"
+
+#include "magma.h"
+#include "magma_arm_mali_types.h"
+#include "magma_vendor_queries.h"
+#include <algorithm>
+#include <stdexcept>
+
+#include <filesystem>
+#include <lib/fdio/directory.h>
+#include <lib/zx/channel.h>
+
+using mali_userspace::MALI_NAME_BLOCK_JM;
+using mali_userspace::MALI_NAME_BLOCK_MMU;
+using mali_userspace::MALI_NAME_BLOCK_SHADER;
+using mali_userspace::MALI_NAME_BLOCK_TILER;
+
+namespace hwcpipe
+{
+namespace
+{
+struct MaliHWInfo        // Hardware description filled in by MaliProfilerMagma::init().
+{
+	unsigned mp_count;         // Number of shader cores: population count of core_mask.
+	unsigned gpu_id;           // Bits [31:16] of the MAGMA_QUERY_DEVICE_ID result.
+	unsigned r_value;          // Bits [15:12] of the MAGMA_QUERY_DEVICE_ID result.
+	unsigned p_value;          // Bits [11:4] of the MAGMA_QUERY_DEVICE_ID result.
+	unsigned core_mask;        // kMsdArmVendorQueryShaderPresent result, narrowed to unsigned.
+	unsigned l2_slices;        // L2 slice count decoded from the kMsdArmVendorQueryMemoryFeatures result.
+};
+
+static uint32_t extract_bits(uint64_t input, uint32_t shift, uint32_t width)
+{        // Returns the `width`-bit field of `input` whose lowest bit is bit `shift`. Requires shift < 64 and width < 64.
+	return static_cast<uint32_t>((input >> shift) & ((uint64_t{1} << width) - 1));        // 64-bit mask: an int `1 << width` is undefined for width >= 32.
+}
+}        // namespace
+
+typedef std::function<uint64_t(void)> MaliValueGetter;        // NOTE(review): inside MaliProfilerMagma members the class's own MaliValueGetter (std::function<double(void)>) presumably takes precedence, so this alias looks unused - confirm.
+
+MaliProfilerMagma::MaliProfilerMagma(const GpuCounterSet &enabled_counters) :
+    enabled_counters_(enabled_counters)
+{
+	// Connects to the GPU first (init() throws std::runtime_error if setup fails), then installs the counter getters matching the detected product.
+	init();
+	// Getter table installed by the valhall/default case of the switch below.
+	const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> valhall_mappings = {
+	    {GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
+	    {GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
+	    {GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},
+	    {GpuCounter::TilerCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TILER_ACTIVE"); }},
+
+	    {GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
+	    {GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
+	    {GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},
+
+	    {GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
+	    {GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
+	    {GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
+	    {GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILL"); }},
+	    {GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_TEST"); }},
+	    {GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_KILL"); }},
+
+	    {GpuCounter::Instructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_FMA") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_CVT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_SFU") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_MSG"); }},
+	    {GpuCounter::DivergedInstructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_DIVERGED"); }},
+
+	    {GpuCounter::ShaderCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_CORE_ACTIVE"); }},
+	    // The three units run in parallel so we can approximate cycles by taking the largest value. SFU instructions use 4 cycles per warp.
+	    {GpuCounter::ShaderArithmeticCycles, [this] { return std::max(get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_FMA"), std::max(get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_CVT"), 4 * get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_SFU"))); }},
+	    {GpuCounter::ShaderLoadStoreCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_ATOMIC"); }},
+	    {GpuCounter::ShaderTextureCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_FILT_NUM_OPERATIONS"); }},
+
+	    {GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
+	    {GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
+	    {GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
+	    {GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
+	    {GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
+	    {GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
+	    {GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
+	    {GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
+	};
+	// Getter table installed by the bifrost cases; it differs from the one above in that Instructions and ShaderArithmeticCycles both read EXEC_INSTR_COUNT.
+	const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> bifrost_mappings = {
+	    {GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
+	    {GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
+	    {GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},
+	    {GpuCounter::TilerCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TILER_ACTIVE"); }},
+
+	    {GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
+	    {GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
+	    {GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},
+
+	    {GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
+	    {GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
+	    {GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
+	    {GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILL"); }},
+	    {GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_TEST"); }},
+	    {GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_KILL"); }},
+
+	    {GpuCounter::Instructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_COUNT"); }},
+	    {GpuCounter::DivergedInstructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_DIVERGED"); }},
+
+	    {GpuCounter::ShaderCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_CORE_ACTIVE"); }},
+	    {GpuCounter::ShaderArithmeticCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_COUNT"); }},
+	    {GpuCounter::ShaderLoadStoreCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_ATOMIC"); }},
+	    {GpuCounter::ShaderTextureCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_FILT_NUM_OPERATIONS"); }},
+
+	    {GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
+	    {GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
+	    {GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
+	    {GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
+	    {GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
+	    {GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
+	    {GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
+	    {GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
+	};
+	// Getter table installed by the midgard cases; it has no tiler, instruction or shader cycle getters.
+	const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> midgard_mappings = {
+	    {GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
+	    {GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
+	    {GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},
+
+	    {GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
+	    {GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
+	    {GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},
+
+	    {GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
+	    {GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
+	    {GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
+	    {GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILLED"); }},
+	    {GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_THREADS_LZS_TEST"); }},
+	    {GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_THREADS_LZS_KILLED"); }},
+
+	    {GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
+	    {GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
+	    {GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
+	    {GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
+	    {GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
+	    {GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
+	    {GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
+	    {GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
+	};
+	// Pick the table from the first product entry whose masked ID matches the GPU ID stored by init().
+	auto product = std::find_if(std::begin(mali_userspace::products), std::end(mali_userspace::products), [&](const mali_userspace::CounterMapping &cm) {
+		return (cm.product_mask & gpu_id_) == cm.product_id;
+	});
+	if (product == std::end(mali_userspace::products))
+	{
+		HWCPIPE_LOG("Mali counters initialization failed: Failed to identify GPU");
+		return;
+	}
+
+	switch (product->product_id)
+	{
+		case mali_userspace::PRODUCT_ID_T60X:
+		case mali_userspace::PRODUCT_ID_T62X:
+		case mali_userspace::PRODUCT_ID_T72X:
+			mappings_                     = midgard_mappings;
+			mappings_[GpuCounter::Pixels] = [this]() { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 256; };
+			break;
+		case mali_userspace::PRODUCT_ID_T76X:
+		case mali_userspace::PRODUCT_ID_T82X:
+		case mali_userspace::PRODUCT_ID_T83X:
+		case mali_userspace::PRODUCT_ID_T86X:
+		case mali_userspace::PRODUCT_ID_TFRX:
+			mappings_ = midgard_mappings;
+			break;
+		case mali_userspace::PRODUCT_ID_TMIX:
+		case mali_userspace::PRODUCT_ID_THEX:
+			mappings_                                  = bifrost_mappings;
+			mappings_[GpuCounter::ShaderTextureCycles] = [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_COORD_ISSUE"); };
+			break;        // Required: falling through would replace this table and lose the override above.
+		case mali_userspace::PRODUCT_ID_TSIX:
+		case mali_userspace::PRODUCT_ID_TNOX:
+		case mali_userspace::PRODUCT_ID_TGOX:
+		case mali_userspace::PRODUCT_ID_TDVX:
+			mappings_ = bifrost_mappings;
+			break;        // Required: falling through would install valhall_mappings instead.
+		case mali_userspace::PRODUCT_ID_TNAXa:
+		case mali_userspace::PRODUCT_ID_TNAXb:
+		case mali_userspace::PRODUCT_ID_TTRX:
+		default:
+			mappings_ = valhall_mappings;
+			break;
+	}
+}
+
+MaliProfilerMagma::~MaliProfilerMagma()
+{
+	// Releases every magma object this profiler holds, skipping members that are still zero.
+	// The buffer and the counter pool are released through connection_, so they go first; the
+	// connection itself is released next and the device last.
+	if (buffer_) { magma_release_buffer(connection_, buffer_); }
+	if (pool_) { magma_connection_release_performance_counter_buffer_pool(connection_, pool_); }
+	if (connection_) { magma_release_connection(connection_); }
+	if (device_) { magma_device_release(device_); }
+	// NOTE(review): notification_handle_ is not closed here - confirm whether releasing the pool covers it.
+}
+
+void MaliProfilerMagma::init()
+{
+	// Opens the Mali GPU through magma, decodes the hardware description, and creates the
+	// performance counter buffer and pool used by sample_counters() and wait_next_event().
+	// Throws std::runtime_error if any step fails.
+
+	// init() runs from the constructor, and a constructor that throws never runs the destructor.
+	// This guard therefore releases whatever was acquired before a failure; it is disarmed on the
+	// last line, once setup has completed.
+	struct Rollback
+	{
+		MaliProfilerMagma *self;
+		~Rollback()
+		{
+			if (self == nullptr)
+			{
+				return;
+			}
+			if (self->buffer_)
+				magma_release_buffer(self->connection_, self->buffer_);
+			if (self->pool_)
+				magma_connection_release_performance_counter_buffer_pool(self->connection_, self->pool_);
+			if (self->connection_)
+				magma_release_connection(self->connection_);
+			if (self->device_)
+				magma_device_release(self->device_);
+		}
+	} rollback{this};
+
+	// Shared check for magma calls whose only failure handling is to throw with a fixed message.
+	const auto require_ok = [](magma_status_t status, const char *message) {
+		if (status != MAGMA_STATUS_OK)
+		{
+			throw std::runtime_error(message);
+		}
+	};
+
+	// Use the first entry under /dev/class/gpu that reports the Mali vendor ID.
+	for (auto &p : std::filesystem::directory_iterator("/dev/class/gpu"))
+	{
+		zx::channel server_end, client_end;
+		if (zx::channel::create(0, &server_end, &client_end) != ZX_OK)
+		{
+			throw std::runtime_error("Failed to create zx channel");
+		}
+		if (fdio_service_connect(p.path().c_str(), server_end.release()) != ZX_OK)
+		{
+			throw std::runtime_error("Failed to connect to device");
+		}
+
+		magma_device_t device;
+		require_ok(magma_device_import(client_end.release(), &device), "Failed to find magma device.");
+		uint64_t             vendor_id    = 0;
+		const magma_status_t query_status = magma_query2(device, MAGMA_QUERY_VENDOR_ID, &vendor_id);
+		if (query_status != MAGMA_STATUS_OK || vendor_id != MAGMA_VENDOR_ID_MALI)
+		{
+			// Unusable device: hand it back before failing or moving on to the next entry.
+			magma_device_release(device);
+			if (query_status != MAGMA_STATUS_OK)
+			{
+				throw std::runtime_error("Failed to query vendor id");
+			}
+			continue;
+		}
+		device_ = device;
+		break;
+	}
+	if (!device_)
+	{
+		throw std::runtime_error("Didn't find valid mali device.");
+	}
+
+	// Decode the hardware description from the device ID and the vendor-specific queries.
+	MaliHWInfo hw_info{};
+	uint64_t   device_id = 0;
+	require_ok(magma_query2(device_, MAGMA_QUERY_DEVICE_ID, &device_id), "Querying device ID failed.");
+	hw_info.gpu_id  = extract_bits(device_id, 16, 16);
+	hw_info.r_value = extract_bits(device_id, 12, 4);
+	hw_info.p_value = extract_bits(device_id, 4, 8);
+
+	uint64_t shader_mask = 0;
+	require_ok(magma_query2(device_, kMsdArmVendorQueryShaderPresent, &shader_mask), "Querying shader present failed.");
+	hw_info.core_mask = shader_mask;
+	hw_info.mp_count  = __builtin_popcountll(hw_info.core_mask);
+
+	uint64_t mem_features = 0;
+	require_ok(magma_query2(device_, kMsdArmVendorQueryMemoryFeatures, &mem_features), "Querying memory features failed.");
+	// MEM_FEATURES bits [11:8] encode the L2 slice count minus one (decoding used by the Arm kbase driver).
+	hw_info.l2_slices = extract_bits(mem_features, 8, 4) + 1;
+
+	num_cores_     = hw_info.mp_count;
+	num_l2_slices_ = hw_info.l2_slices;
+	gpu_id_        = hw_info.gpu_id;
+
+	require_ok(magma_create_connection2(device_, &connection_), "Creating magma connection failed.");
+
+	// Offer every entry of the performance counter access directory to the driver; at least one
+	// must be accepted before counters can be used on this connection.
+	bool success = false;
+	for (auto &p : std::filesystem::directory_iterator("/dev/class/gpu-performance-counters"))
+	{
+		zx::channel server_end, client_end;
+		if (zx::channel::create(0, &server_end, &client_end) != ZX_OK)
+		{
+			throw std::runtime_error("Failed to create zx channel");
+		}
+		if (fdio_service_connect(p.path().c_str(), server_end.release()) != ZX_OK)
+		{
+			throw std::runtime_error("Failed to connect to GPU perf count access service\n");
+		}
+		if (magma_connection_access_performance_counters(connection_, client_end.release()) == MAGMA_STATUS_OK)
+		{
+			success = true;
+		}
+	}
+	if (!success)
+	{
+		throw std::runtime_error("Failed to enable perf count access.");
+	}
+
+	// At the moment we only ever should have 1 read outstanding, so only create one buffer.
+	constexpr uint32_t kBufferSize = 4096;
+	size_t             buffer_size = 0;
+	require_ok(magma_create_buffer(connection_, kBufferSize, &buffer_size, &buffer_), "Create buffer failed.");
+	buffer_size_ = buffer_size;
+
+	require_ok(magma_connection_create_performance_counter_buffer_pool(connection_, &pool_, &notification_handle_), "Create performance counter buffer pool failed.");
+
+	// Register the first kBufferSize bytes of the buffer with the pool.
+	magma_buffer_offset offset;
+	offset.buffer_id = magma_get_buffer_id(buffer_);
+	offset.offset    = 0;
+	offset.length    = kBufferSize;
+	require_ok(magma_connection_add_performance_counter_buffer_offsets_to_pool(connection_, pool_, &offset, 1), "Add performance counters failed.");
+
+	uint64_t vector = 1;
+	require_ok(magma_connection_enable_performance_counters(connection_, &vector, 1), "Enable performance counters failed.");
+
+	// Pick the counter name table of the first product entry whose masked ID matches the GPU ID.
+	const auto product = std::find_if(std::begin(mali_userspace::products), std::end(mali_userspace::products), [&](const mali_userspace::CounterMapping &cm) {
+		return (cm.product_mask & hw_info.gpu_id) == cm.product_id;
+	});
+	if (product == std::end(mali_userspace::products))
+	{
+		throw std::runtime_error("Could not identify GPU.");
+	}
+	names_lut_ = product->names_lut;
+
+	raw_counter_buffer_.resize(buffer_size_ / sizeof(uint32_t));
+
+	// Build core remap table: entry i is the bit position of the i-th set bit of the core mask.
+	core_index_remap_.clear();
+	core_index_remap_.reserve(hw_info.mp_count);
+	unsigned int mask = hw_info.core_mask;
+	while (mask != 0)
+	{
+		unsigned int bit = __builtin_ctz(mask);
+		core_index_remap_.push_back(bit);
+		mask &= ~(1u << bit);
+	}
+
+	rollback.self = nullptr;        // Setup succeeded: from here on the destructor releases the handles.
+}
+
+void MaliProfilerMagma::run()
+{        // Takes one initial sample: requests a counter dump, then waits until it has been read back.
+	this->sample_counters();
+	this->wait_next_event();
+}
+
+void MaliProfilerMagma::stop()
+{
+	// We don't need to do anything on stop(): counter dumps are only requested from run() and sample().
+}
+
+const GpuMeasurements &MaliProfilerMagma::sample()
+{
+	// Request a fresh dump and wait until its values have been copied into raw_counter_buffer_.
+	this->sample_counters();
+	this->wait_next_event();
+
+	// Refresh the measurement of every enabled counter that has a getter in mappings_; enabled
+	// counters without a getter are skipped and keep whatever entry they already had.
+	for (const auto &enabled : enabled_counters_)
+	{
+		const auto getter = mappings_.find(enabled);
+		if (getter != mappings_.end())
+		{
+			measurements_[getter->first] = getter->second();
+		}
+	}
+	return measurements_;
+}
+
+void MaliProfilerMagma::sample_counters()
+{
+	// Triggers a performance counter dump on connection_ using pool_; throws std::runtime_error if the request is rejected.
+	if (magma_connection_dump_performance_counters(connection_, pool_, 1) != MAGMA_STATUS_OK)
+	{
+		throw std::runtime_error("Dump performance counters failed.");
+	}
+}
+
+void MaliProfilerMagma::wait_next_event()
+{
+	// Polls the notification handle (INT64_MAX timeout) until it is readable, consumes the
+	// completion record, copies the dumped counters into raw_counter_buffer_, and finally adds the
+	// buffer region back to the pool. Throws std::runtime_error if any magma call fails.
+	magma_poll_item_t item{};
+	item.type      = MAGMA_POLL_TYPE_HANDLE;
+	item.condition = MAGMA_POLL_CONDITION_READABLE;
+	item.handle    = notification_handle_;
+	if (magma_poll(&item, 1, INT64_MAX) != MAGMA_STATUS_OK)
+	{
+		throw std::runtime_error("Poll for performance counters failed.");
+	}
+
+	// Only the time of the completion is kept; the other fields are read and discarded.
+	uint32_t trigger_id, buffer_offset, result_flags;
+	uint64_t buffer_id, dump_time;
+	if (magma_connection_read_performance_counter_completion(connection_, pool_, &trigger_id, &buffer_id, &buffer_offset, &dump_time, &result_flags) != MAGMA_STATUS_OK)
+	{
+		throw std::runtime_error("Read performance counters failed.");
+	}
+
+	// NOTE(review): the mapping is never unmapped here - confirm that repeated magma_map calls are intended.
+	void *mapped;
+	if (magma_map(connection_, buffer_, &mapped) != MAGMA_STATUS_OK)
+	{
+		throw std::runtime_error("Mapping performance counters failed.");
+	}
+	memcpy(raw_counter_buffer_.data(), mapped, 4096);
+	timestamp_ = dump_time;
+
+	// Re-add the same 4096-byte region that init() registered.
+	magma_buffer_offset region;
+	region.buffer_id = magma_get_buffer_id(buffer_);
+	region.offset    = 0;
+	region.length    = 4096;
+	if (magma_connection_add_performance_counter_buffer_offsets_to_pool(connection_, pool_, &region, 1) != MAGMA_STATUS_OK)
+	{
+		throw std::runtime_error("Add performance counters failed.");
+	}
+}
+
+uint64_t MaliProfilerMagma::get_counter_value(mali_userspace::MaliCounterBlockName block, const char *name) const
+{
+	// Returns the most recently sampled value of counter `name` in `block`. MMU counters are summed
+	// over all L2 slices and shader counters over all shader cores; any other block is read once.
+	// The name is resolved a single time, and a name that is missing from the product's table
+	// yields 0 instead of reading the block at index -1.
+	const int counter = find_counter_index_by_name(block, name);
+	if (counter < 0)
+	{
+		return 0;
+	}
+
+	const bool per_slice = (block == mali_userspace::MALI_NAME_BLOCK_MMU);
+	if (!per_slice && block != mali_userspace::MALI_NAME_BLOCK_SHADER)
+	{
+		return static_cast<uint64_t>(get_counters(block)[counter]);
+	}
+
+	// get_counters() range-checks the slice or core index and throws std::runtime_error if it is invalid.
+	const int instances = per_slice ? num_l2_slices_ : num_cores_;
+	uint64_t  sum       = 0;
+	for (int i = 0; i < instances; i++)
+	{
+		sum += get_counters(block, i)[counter];
+	}
+	return sum;
+}
+
+const uint32_t *MaliProfilerMagma::get_counters(mali_userspace::MaliCounterBlockName block, int index) const
+{
+	// Returns a pointer to the first counter of the requested block inside raw_counter_buffer_.
+	// Block order: job manager (0), tiler (1), one block per L2 slice, then the shader core blocks,
+	// each placed by the core's entry in core_index_remap_. `index` selects the slice or core and
+	// is ignored for the job manager and tiler; std::runtime_error is thrown if it is out of range.
+	const uint32_t *const base = raw_counter_buffer_.data();
+	if (block == mali_userspace::MALI_NAME_BLOCK_JM || block == mali_userspace::MALI_NAME_BLOCK_TILER)
+	{
+		return base + mali_userspace::MALI_NAME_BLOCK_SIZE * (block == mali_userspace::MALI_NAME_BLOCK_TILER ? 1 : 0);
+	}
+	if (block == mali_userspace::MALI_NAME_BLOCK_MMU)
+	{
+		if (index < 0 || index >= num_l2_slices_)
+		{
+			throw std::runtime_error("Invalid slice number.");
+		}
+		return base + mali_userspace::MALI_NAME_BLOCK_SIZE * (2 + index);
+	}
+	// Every remaining block name is handled as a shader core block.
+	if (index < 0 || index >= num_cores_)
+	{
+		throw std::runtime_error("Invalid core number.");
+	}
+	return base + mali_userspace::MALI_NAME_BLOCK_SIZE * (2 + num_l2_slices_ + core_index_remap_[index]);
+}
+
+int MaliProfilerMagma::find_counter_index_by_name(mali_userspace::MaliCounterBlockName block, const char *name) const
+{
+	// Returns the position within `block` of the first counter whose table name contains `name`
+	// as a substring, or -1 if no name in the block matches. The block's names occupy
+	// MALI_NAME_BLOCK_SIZE consecutive entries of names_lut_.
+	const char *const *const first = names_lut_ + mali_userspace::MALI_NAME_BLOCK_SIZE * block;
+	const char *const *const last  = first + mali_userspace::MALI_NAME_BLOCK_SIZE;
+
+	const auto hit = std::find_if(first, last, [name](const char *candidate) {
+		return strstr(candidate, name) != nullptr;
+	});
+
+	return (hit == last) ? -1 : static_cast<int>(hit - first);
+}
+
+}        // namespace hwcpipe
diff --git a/vendor/arm/mali/mali_profiler_magma.h b/vendor/arm/mali/mali_profiler_magma.h
new file mode 100644
index 0000000..0a604e5
--- /dev/null
+++ b/vendor/arm/mali/mali_profiler_magma.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef THIRD_PARTY_GITHUB_COM_ARM_SOFTWARE_HWCPIPE_VENDOR_ARM_MALI_MALI_PROFILER_MAGMA_H_
+#define THIRD_PARTY_GITHUB_COM_ARM_SOFTWARE_HWCPIPE_VENDOR_ARM_MALI_MALI_PROFILER_MAGMA_H_
+
+#include "gpu_profiler.h"
+
+#include "hwc.hpp"
+
+#include <functional>
+#include <vector>
+
+#include "magma.h"
+
+namespace hwcpipe
+{
+/** A Gpu profiler that reads Mali hardware counters through the magma performance counter API. */
+class MaliProfilerMagma : public GpuProfiler
+{
+  public:
+	explicit MaliProfilerMagma(const GpuCounterSet &enabled_counters);        // Throws std::runtime_error if GPU setup fails.
+	virtual ~MaliProfilerMagma();
+	MaliProfilerMagma(const MaliProfilerMagma &) = delete;        // Not copyable: owns magma handles, and the getters in mappings_ capture `this`.
+	MaliProfilerMagma &operator=(const MaliProfilerMagma &) = delete;
+
+	virtual const GpuCounterSet &enabled_counters() const override
+	{
+		return enabled_counters_;
+	}
+
+	virtual const GpuCounterSet &supported_counters() const override
+	{
+		return supported_counters_;
+	}
+
+	virtual void set_enabled_counters(GpuCounterSet counters) override
+	{
+		enabled_counters_ = std::move(counters);
+	}
+
+	virtual void                   run() override;           // Takes one initial sample.
+	virtual const GpuMeasurements &sample() override;        // Dumps the counters and returns the refreshed measurements.
+	virtual void                   stop() override;          // Does nothing.
+
+  private:
+	GpuCounterSet enabled_counters_{};        // Counters whose measurements sample() refreshes.
+	const GpuCounterSet supported_counters_{
+	    GpuCounter::GpuCycles,
+	    GpuCounter::VertexComputeCycles,
+	    GpuCounter::FragmentCycles,
+	    GpuCounter::TilerCycles,
+	    GpuCounter::VertexComputeJobs,
+	    GpuCounter::Tiles,
+	    GpuCounter::TransactionEliminations,
+	    GpuCounter::FragmentJobs,
+	    GpuCounter::Pixels,
+	    GpuCounter::EarlyZTests,
+	    GpuCounter::EarlyZKilled,
+	    GpuCounter::LateZTests,
+	    GpuCounter::LateZKilled,
+	    GpuCounter::Instructions,
+	    GpuCounter::DivergedInstructions,
+	    GpuCounter::ShaderCycles,
+	    GpuCounter::ShaderArithmeticCycles,
+	    GpuCounter::ShaderLoadStoreCycles,
+	    GpuCounter::ShaderTextureCycles,
+	    GpuCounter::CacheReadLookups,
+	    GpuCounter::CacheWriteLookups,
+	    GpuCounter::ExternalMemoryReadAccesses,
+	    GpuCounter::ExternalMemoryWriteAccesses,
+	    GpuCounter::ExternalMemoryReadStalls,
+	    GpuCounter::ExternalMemoryWriteStalls,
+	    GpuCounter::ExternalMemoryReadBytes,
+	    GpuCounter::ExternalMemoryWriteBytes,
+	};
+
+	typedef std::function<double(void)>                             MaliValueGetter;
+	std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> mappings_{};        // Getter per counter for the detected product.
+
+	int                num_cores_{0};                 // Shader core count: population count of the core mask.
+	int                num_l2_slices_{0};             // L2 slice count; MMU counters are summed over this many blocks.
+	int                gpu_id_{0};                    // Product ID bits of the device ID, used to select mappings_.
+	size_t             buffer_size_{0};               // Size in bytes reported by magma_create_buffer.
+	magma_buffer_t     buffer_ = 0;                   // Buffer registered with pool_ to receive counter dumps.
+	uint64_t           timestamp_{0};                 // Time reported with the last completed dump.
+	const char *const *names_lut_{nullptr};           // Counter name table of the detected product.
+	std::vector<uint32_t>     raw_counter_buffer_{};        // Copy of the last dump, read by the getters.
+	std::vector<unsigned int> core_index_remap_{};          // Core index -> bit position in the core mask.
+	magma_device_t            device_ = 0;
+	magma_connection_t        connection_{};
+	magma_perf_count_pool_t   pool_{};
+	magma_handle_t            notification_handle_ = 0;        // Polled by wait_next_event() for dump completions.
+
+	GpuMeasurements measurements_{};
+
+	void            init();
+	void            sample_counters();
+	void            wait_next_event();
+	const uint32_t *get_counters(mali_userspace::MaliCounterBlockName block, int index = 0) const;
+	uint64_t        get_counter_value(mali_userspace::MaliCounterBlockName block, const char *name) const;
+	int             find_counter_index_by_name(mali_userspace::MaliCounterBlockName block, const char *name) const;
+};
+
+}        // namespace hwcpipe
+
+#endif        // THIRD_PARTY_GITHUB_COM_ARM_SOFTWARE_HWCPIPE_VENDOR_ARM_MALI_MALI_PROFILER_MAGMA_H_