external/vulkancts/modules/vulkan/compute/vktComputeCooperativeMatrixTests.cpp - third_party/vulkan-cts - Git at Google

 /*------------------------------------------------------------------------
  * Vulkan Conformance Tests
  * ------------------------
  *
  * Copyright (c) 2019 The Khronos Group Inc.
  * Copyright (c) 2018-2019 NVIDIA Corporation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *	  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  *//*!
  * \file
  * \brief Vulkan Cooperative Matrix tests
  *//*--------------------------------------------------------------------*/

 #include "vktComputeCooperativeMatrixTests.hpp"

 #include "vkBufferWithMemory.hpp"
 #include "vkImageWithMemory.hpp"
 #include "vkQueryUtil.hpp"
 #include "vkBuilderUtil.hpp"
 #include "vkCmdUtil.hpp"
 #include "vkTypeUtil.hpp"
 #include "vktTestGroupUtil.hpp"
 #include "vktTestCase.hpp"

 #include "deDefs.h"
 #include "deFloat16.h"
 #include "deMath.h"
 #include "deRandom.h"
 #include "deSharedPtr.hpp"
 #include "deString.h"

 #include "tcuTestCase.hpp"
 #include "tcuTestLog.hpp"

 #include <string>
 #include <sstream>

 namespace vkt
 {
 namespace compute
 {
 namespace
 {
 using namespace vk;
 using namespace std;

 typedef enum
 {
 	TT_LENGTH = 0,
 	TT_CONSTANT,
 	TT_FCONVERT,
 	TT_COMPOSITE,
 	TT_COMPOSITE_RVALUE,
 	TT_FADD,
 	TT_FSUB,
 	TT_FDIV,
 	TT_FNEGATE,
 	TT_MATRIXTIMESSCALAR,
 	TT_FUNC,
 	TT_MATRIXMULADD,
 	TT_COMPOSITE_ARRAY,
 	TT_MATRIXMULADD_ARRAY,
 } TestType;

 typedef enum
 {
 	SIZE_8x8 = 0,
 	SIZE_16x8,
 	SIZE_16x16,
 	SIZE_16x8x8,
 	SIZE_16x8x16,
 } SizeType;

 typedef enum
 {
 	SC_BUFFER = 0,
 	SC_WORKGROUP,
 	SC_WORKGROUP_VARIABLE_POINTERS,
 	SC_BUFFER_VARIABLE_POINTERS,
 	SC_PHYSICAL_STORAGE_BUFFER,
 } StorageClass;

 const VkFlags allShaderStages = VK_SHADER_STAGE_COMPUTE_BIT;

 struct CaseDef
 {
 	TestType testType;
 	// When testing a multiply, MxNxK is the type of matrix multiply.
 	// Otherwise, MxN is the size of the input/output matrices
 	deUint32 M;
 	deUint32 N;
 	deUint32 K;
 	deUint32 subgroupsPerWorkgroupX;
 	deUint32 subgroupsPerWorkgroupY;
 	deUint32 workgroupsX;
 	deUint32 workgroupsY;
 	VkComponentTypeNV inputType;
 	VkComponentTypeNV outputType;
 	bool colMajor;
 	StorageClass storageClass;
 };

 class CooperativeMatrixTestInstance : public TestInstance
 {
 public:
 						CooperativeMatrixTestInstance	(Context& context, const CaseDef& data);
 						~CooperativeMatrixTestInstance	(void);
 	tcu::TestStatus		iterate				(void);
 private:
 	CaseDef			m_data;
 };

 CooperativeMatrixTestInstance::CooperativeMatrixTestInstance (Context& context, const CaseDef& data)
 	: vkt::TestInstance		(context)
 	, m_data				(data)
 {
 }

 CooperativeMatrixTestInstance::~CooperativeMatrixTestInstance (void)
 {
 }

 class CooperativeMatrixTestCase : public TestCase
 {
 	public:
 								CooperativeMatrixTestCase		(tcu::TestContext& context, const char* name, const char* desc, const CaseDef data);
 								~CooperativeMatrixTestCase	(void);
 	virtual	void				initPrograms		(SourceCollections& programCollection) const;
 	virtual TestInstance*		createInstance		(Context& context) const;
 	virtual void				checkSupport		(Context& context) const;

 private:
 	CaseDef					m_data;
 };

 CooperativeMatrixTestCase::CooperativeMatrixTestCase (tcu::TestContext& context, const char* name, const char* desc, const CaseDef data)
 	: vkt::TestCase	(context, name, desc)
 	, m_data		(data)
 {
 }

 CooperativeMatrixTestCase::~CooperativeMatrixTestCase	(void)
 {
 }

 void CooperativeMatrixTestCase::checkSupport(Context& context) const
 {
 	if (!context.contextSupports(vk::ApiVersion(1, 1, 0)))
 	{
 		TCU_THROW(NotSupportedError, "Vulkan 1.1 not supported");
 	}

 	if (!context.getCooperativeMatrixFeatures().cooperativeMatrix)
 	{
 		TCU_THROW(NotSupportedError, "cooperativeMatrix not supported");
 	}

 	if (!context.getVulkanMemoryModelFeatures().vulkanMemoryModel)
 	{
 		TCU_THROW(NotSupportedError, "vulkanMemoryModel not supported");
 	}

 	if ((m_data.storageClass == SC_WORKGROUP_VARIABLE_POINTERS || m_data.storageClass == SC_BUFFER_VARIABLE_POINTERS) &&
 		!context.getVariablePointerFeatures().variablePointers)
 	{
 		TCU_THROW(NotSupportedError, "variable pointers not supported");
 	}

 	if (m_data.storageClass == SC_PHYSICAL_STORAGE_BUFFER &&
 		!context.getBufferDeviceAddressFeatures().bufferDeviceAddress)
 	{
 		TCU_THROW(NotSupportedError, "buffer device address not supported");
 	}

 	if (!context.getFloat16Int8Features().shaderFloat16 &&
 		(m_data.inputType == VK_COMPONENT_TYPE_FLOAT16_NV || m_data.outputType == VK_COMPONENT_TYPE_FLOAT16_NV))
 	{
 		TCU_THROW(NotSupportedError, "shaderFloat16 not supported");
 	}

 	deUint32 propertyCount = 0;
 	VkCooperativeMatrixPropertiesNV *pProperties;
 	context.getInstanceInterface().getPhysicalDeviceCooperativeMatrixPropertiesNV(context.getPhysicalDevice(), &propertyCount, DE_NULL);
 	if (propertyCount == 0)
 		TCU_THROW(NotSupportedError, "cooperative matrices not supported");

 	bool supported[2] = { false, false };
 	pProperties = new VkCooperativeMatrixPropertiesNV[propertyCount];

 	for (deUint32 i = 0; i < propertyCount; ++i)
 	{
 		VkCooperativeMatrixPropertiesNV *p = &pProperties[i];
 		p->sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV;
 		p->pNext = DE_NULL;
 	}

 	context.getInstanceInterface().getPhysicalDeviceCooperativeMatrixPropertiesNV(context.getPhysicalDevice(), &propertyCount, pProperties);

 	for (deUint32 i = 0; i < propertyCount; ++i)
 	{
 		VkCooperativeMatrixPropertiesNV *p = &pProperties[i];
 		if (m_data.testType == TT_MATRIXMULADD ||
 			m_data.testType == TT_MATRIXMULADD_ARRAY)
 		{
 			if (p->MSize == m_data.M &&
 				p->NSize == m_data.N &&
 				p->KSize == m_data.K &&
 				p->AType == m_data.inputType &&
 				p->BType == m_data.inputType &&
 				p->CType == m_data.outputType &&
 				p->DType == m_data.outputType &&
 				p->scope == VK_SCOPE_SUBGROUP_NV)
 			{
 				supported[0] = supported[1] = true;
 			}
 		}
 		else
 		{
 			VkComponentTypeNV types[2] = { m_data.inputType, m_data.outputType };

 			for (deUint32 j = 0; j < 2; ++j)
 			{
 				// For these tests, m_data.M/N are always the matrix size. Check if they match
 				// any input or output in the list.
 				if ((p->scope == VK_SCOPE_SUBGROUP_NV && p->MSize == m_data.M && p->NSize == m_data.N && (p->CType == types[j] || p->DType == types[j])) ||
 					(p->scope == VK_SCOPE_SUBGROUP_NV && p->MSize == m_data.M && p->KSize == m_data.N && p->AType == types[j]) ||
 					(p->scope == VK_SCOPE_SUBGROUP_NV && p->KSize == m_data.M && p->NSize == m_data.N && p->BType == types[j]))
 				{
 					supported[j] = true;
 				}
 			}
 		}
 	}

 	delete [] pProperties;

 	if (!supported[0] || !supported[1])
 		TCU_THROW(NotSupportedError, "cooperative matrix combination not supported");
 }


 void CooperativeMatrixTestCase::initPrograms (SourceCollections& programCollection) const
 {
 	std::stringstream css;
 	css << "#version 450 core\n";
 	css << "#pragma use_vulkan_memory_model\n";
 	css <<
 		"#extension GL_KHR_shader_subgroup_basic : enable\n"
 		"#extension GL_KHR_memory_scope_semantics : enable\n"
 		"#extension GL_NV_cooperative_matrix : enable\n"
 		"#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable\n"
 		"#extension GL_EXT_buffer_reference : enable\n"
 		"// strides overriden by spec constants\n"
 		"layout(constant_id = 2) const int AStride = 1;\n"
 		"layout(constant_id = 3) const int BStride = 1;\n"
 		"layout(constant_id = 4) const int CStride = 1;\n"
 		"layout(constant_id = 5) const int OStride = 1;\n"
 		"layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 1) in;\n";

 	if (m_data.storageClass == SC_BUFFER_VARIABLE_POINTERS || m_data.storageClass == SC_WORKGROUP_VARIABLE_POINTERS)
 		css << "#pragma use_variable_pointers\n";

 	struct
 	{
 		deUint32 rows, cols;
 	} dims[4];

 	if (m_data.testType == TT_MATRIXMULADD ||
 		m_data.testType == TT_MATRIXMULADD_ARRAY)
 	{
 		dims[0].rows = m_data.M;
 		dims[0].cols = m_data.K;
 		dims[1].rows = m_data.K;
 		dims[1].cols = m_data.N;
 		dims[2].rows = m_data.M;
 		dims[2].cols = m_data.N;
 		dims[3].rows = m_data.M;
 		dims[3].cols = m_data.N;
 	}
 	else
 	{
 		dims[0].rows = m_data.M;
 		dims[0].cols = m_data.N;
 		dims[1].rows = m_data.M;
 		dims[1].cols = m_data.N;
 		dims[2].rows = m_data.M;
 		dims[2].cols = m_data.N;
 		dims[3].rows = m_data.M;
 		dims[3].cols = m_data.N;
 	}

 	const char *typeStrA = m_data.inputType == VK_COMPONENT_TYPE_FLOAT16_NV ? "float16_t" : "float";
 	const char *typeStrB = m_data.inputType == VK_COMPONENT_TYPE_FLOAT16_NV ? "float16_t" : "float";
 	const char *typeStrC = m_data.outputType == VK_COMPONENT_TYPE_FLOAT16_NV ? "float16_t" : "float";
 	const char *typeStrO = m_data.outputType == VK_COMPONENT_TYPE_FLOAT16_NV ? "float16_t" : "float";

 	css << "const uvec2 subgroupsPerWG = uvec2(" << m_data.subgroupsPerWorkgroupX << ", " << m_data.subgroupsPerWorkgroupY << ");\n";

 	if (m_data.storageClass == SC_PHYSICAL_STORAGE_BUFFER)
 	{
 		css << "layout(buffer_reference) buffer InputA { " << typeStrA << " x[]; };\n";
 		css << "layout(buffer_reference) buffer InputB { " << typeStrB << " x[]; };\n";
 		css << "layout(buffer_reference) buffer InputC { " << typeStrC << " x[]; };\n";
 		css << "layout(buffer_reference) buffer Output { " << typeStrO << " x[]; };\n";
 		css << "layout(set=0, binding=4) buffer Params { InputA inputA; InputB inputB; InputC inputC; Output outputO; } params;\n";
 	}
 	else
 	{
 		css << "layout(set=0, binding=0) coherent buffer InputA { " << typeStrA << " x[]; } inputA;\n";
 		css << "layout(set=0, binding=1) coherent buffer InputB { " << typeStrB << " x[]; } inputB;\n";
 		css << "layout(set=0, binding=2) coherent buffer InputC { " << typeStrC << " x[]; } inputC;\n";
 		css << "layout(set=0, binding=3) coherent buffer Output { " << typeStrO << " x[]; } outputO;\n";
 	}

 	if (m_data.storageClass == SC_WORKGROUP || m_data.storageClass == SC_WORKGROUP_VARIABLE_POINTERS)
 	{
 		css << "shared " << typeStrA << " sharedA[" << dims[0].rows * dims[0].cols << " * subgroupsPerWG.x * subgroupsPerWG.y];\n";
 		css << "shared " << typeStrB << " sharedB[" << dims[1].rows * dims[1].cols << " * subgroupsPerWG.x * subgroupsPerWG.y];\n";
 		css << "shared " << typeStrC << " sharedC[" << dims[2].rows * dims[2].cols << " * subgroupsPerWG.x * subgroupsPerWG.y];\n";
 		css << "shared " << typeStrO << " sharedO[" << dims[3].rows * dims[3].cols << " * subgroupsPerWG.x * subgroupsPerWG.y];\n";
 	}

 	std::stringstream matAType, matBType, matCType, outputMatType;

 	matAType  << "fcoopmatNV<" << ((m_data.inputType == VK_COMPONENT_TYPE_FLOAT16_NV)   ? 16 : 32) << ", gl_ScopeSubgroup, " << dims[0].rows << ", " << dims[0].cols << ">";
 	matBType  << "fcoopmatNV<" << ((m_data.inputType == VK_COMPONENT_TYPE_FLOAT16_NV)   ? 16 : 32) << ", gl_ScopeSubgroup, " << dims[1].rows << ", " << dims[1].cols << ">";
 	matCType  << "fcoopmatNV<" << ((m_data.outputType == VK_COMPONENT_TYPE_FLOAT16_NV)   ? 16 : 32) << ", gl_ScopeSubgroup, " << dims[2].rows << ", " << dims[2].cols << ">";
 	outputMatType << "fcoopmatNV<" << ((m_data.outputType == VK_COMPONENT_TYPE_FLOAT16_NV)  ? 16 : 32) << ", gl_ScopeSubgroup, " << dims[3].rows << ", " << dims[3].cols << ">";

 	css << matAType.str() << " matA;\n";
 	css << matBType.str() << " matB;\n";
 	css << matCType.str() << " matC;\n";
 	css << outputMatType.str() << " matO;\n";

 	if (m_data.testType == TT_CONSTANT)
 		css << "const " << outputMatType.str() << " matConst = " << outputMatType.str() << "(1.0);\n";

 	if (m_data.testType == TT_FUNC)
 		css << matAType.str() << " f(" << matAType.str() << " m) { return -m; }\n";

 	css <<
 		"void main()\n"
 		"{\n"
 		// matrixID is the x,y index of the matrix owned by this subgroup.
 		"   uvec2 subgroupXY = uvec2(gl_SubgroupID % subgroupsPerWG.x, gl_SubgroupID / subgroupsPerWG.x);\n"
 		"   uvec2 matrixID = uvec2(gl_WorkGroupID.xy) * subgroupsPerWG + subgroupXY;\n";

 	if (m_data.storageClass == SC_PHYSICAL_STORAGE_BUFFER)
 	{
 		css << "   InputA inputA = params.inputA;\n";
 		css << "   InputB inputB = params.inputB;\n";
 		css << "   InputC inputC = params.inputC;\n";
 		css << "   Output outputO = params.outputO;\n";
 	}

 	deUint32 strides[4]; // in elements
 	for (deUint32 i = 0; i < 4; ++i)
 	{
 		strides[i] = (m_data.colMajor ? dims[i].rows : dims[i].cols) * m_data.subgroupsPerWorkgroupX * m_data.workgroupsX;
 	}

 	// element<i> is the starting element in buffer memory.
 	// elementS<i> is the starting element in shared memory.
 	css << "   uint element0 = " << strides[0] * (m_data.colMajor ? dims[0].cols : dims[0].rows) << " * matrixID.y + " << (m_data.colMajor ? dims[0].rows : dims[0].cols) << " * matrixID.x;\n"
 		   "   uint element1 = " << strides[1] * (m_data.colMajor ? dims[1].cols : dims[1].rows) << " * matrixID.y + " << (m_data.colMajor ? dims[1].rows : dims[1].cols) << " * matrixID.x;\n"
 		   "   uint element2 = " << strides[2] * (m_data.colMajor ? dims[2].cols : dims[2].rows) << " * matrixID.y + " << (m_data.colMajor ? dims[2].rows : dims[2].cols) << " * matrixID.x;\n"
 		   "   uint element3 = " << strides[3] * (m_data.colMajor ? dims[3].cols : dims[3].rows) << " * matrixID.y + " << (m_data.colMajor ? dims[3].rows : dims[3].cols) << " * matrixID.x;\n"
 		   "   uint elementS0, elementS1, elementS2, elementS3;\n";

 	// For shared memory tests, copy the matrix from buffer memory into
 	// workgroup memory. For simplicity, do it all on a single thread.
 	if (m_data.storageClass == SC_WORKGROUP || m_data.storageClass == SC_WORKGROUP_VARIABLE_POINTERS)
 	{
 		const char *name[] =
 		{
 			"sharedA",
 			"sharedB",
 			"sharedC",
 		};
 		const char *inputName[] =
 		{
 			"inputA",
 			"inputB",
 			"inputC",
 		};
 		for (deUint32 m = 0; m < 4; ++m)
 		{
 			deUint32 sharedStride = strides[m] / m_data.workgroupsX;
 			css << "       elementS" << m << " = " << sharedStride * (m_data.colMajor ? dims[m].cols : dims[m].rows) << " * subgroupXY.y + " << (m_data.colMajor ? dims[m].rows : dims[m].cols) << " * subgroupXY.x;\n";
 		}
 		css << "   if (subgroupElect()) {\n";
 		// copy all three input buffers.
 		for (deUint32 m = 0; m < 3; ++m)
 		{
 			deUint32 sharedStride = strides[m] / m_data.workgroupsX;
 			css <<  "       for (int i = 0; i < " << dims[m].rows << "; ++i) {\n"
 					"       for (int j = 0; j < " << dims[m].cols << "; ++j) {\n"
 					"           int localElementInput = " << strides[m] << " * " << (m_data.colMajor ? "j" : "i") << " + " << (m_data.colMajor ? "i" : "j") << ";\n"
 					"           int localElementShared = " << sharedStride << " * " << (m_data.colMajor ? "j" : "i") << " + " << (m_data.colMajor ? "i" : "j") << ";\n"
 					"           " << name[m] << "[elementS" << m << " + localElementShared] = " << inputName[m] << ".x[element" << m << " + localElementInput];\n"
 					"       }\n"
 					"       }\n";
 			strides[m] = sharedStride;
 		}
 		css << "   }\n";
 		css << "   controlBarrier(gl_ScopeSubgroup, gl_ScopeSubgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease);\n";
 	}

 	const char *colMajor = (m_data.colMajor ? "true" : "false");

 	if (m_data.storageClass == SC_WORKGROUP || m_data.storageClass == SC_WORKGROUP_VARIABLE_POINTERS)
 	{
 		css <<  "   coopMatLoadNV(matA, sharedA, elementS0, " << strides[0] << ", " << colMajor << ");\n"
 				"   coopMatLoadNV(matB, sharedB, elementS1, " << strides[1] << ", " << colMajor << ");\n"
 				"   coopMatLoadNV(matC, sharedC, elementS2, " << strides[2] << ", " << colMajor << ");\n";
 	}
 	else
 	{
 		css << "   coopMatLoadNV(matA, inputA.x, element0, " << strides[0] << ", " << colMajor << ");\n"
 			   "   coopMatLoadNV(matB, inputB.x, element1, " << strides[1] << ", " << colMajor << ");\n"
 			   "   coopMatLoadNV(matC, inputC.x, element2, " << strides[2] << ", " << colMajor << ");\n";
 	}

 	if (m_data.testType == TT_COMPOSITE_ARRAY ||
 		m_data.testType == TT_MATRIXMULADD_ARRAY)
 	{
 		css << "   " << matAType.str() << " matAArr[2];\n    matAArr[1] = matA; matAArr[0] = " << matAType.str() << "(0.0);\n"
 			   "   " << matBType.str() << " matBArr[2];\n    matBArr[1] = matB; matBArr[0] = " << matBType.str() << "(0.0);\n"
 			   "   " << matCType.str() << " matCArr[2];\n    matCArr[1] = matC; matCArr[0] = " << matCType.str() << "(0.0);\n"
 			   "   " << outputMatType.str() << " matOArr[2];\n";
 	}

 	switch (m_data.testType)
 	{
 	default:
 		DE_ASSERT(0);
 		// fall through
 	case TT_LENGTH:
 		css << "   matO = " << outputMatType.str() << "(matO.length());\n";
 		break;
 	case TT_CONSTANT:
 		css << "   matO = matConst;\n";
 		break;
 	case TT_FCONVERT:
 		css << "   matO = " << outputMatType.str() << "(matA);\n";
 		break;
 	case TT_COMPOSITE:
 	case TT_COMPOSITE_RVALUE:
 		css << "   for (int i = 0; i < matA.length(); ++i) {\n"
 			   "       matO[i] = matA[i] + matB[i];\n"
 			   "   }\n";
 		if (m_data.testType == TT_COMPOSITE_RVALUE)
 		{
 			css << "   " << matAType.str() << " t = matA;\n"
 				   "   matO[0] = (t += matB)[0];\n"
 				   "   if (matA.length() > 0) {\n"
 				   "       t = matA;\n"
 				   "       matO[1] = (t += matB)[1];\n"
 				   "   }\n";
 		}
 		break;
 	case TT_COMPOSITE_ARRAY:
 		css << "   for (int i = 0; i < matA.length(); ++i) {\n"
 			   "       matOArr[1][i] = matAArr[1][i] + matBArr[1][i];\n"
 			   "   }\n";
 		break;
 	case TT_FADD:
 		css << "   matO = matA + matB;\n";
 		break;
 	case TT_FSUB:
 		css << "   matO = matA - matB;\n";
 		break;
 	case TT_FDIV:
 		css << "   matO = matA / matB;\n";
 		break;
 	case TT_FNEGATE:
 		css << "   matO = -matA;\n";
 		break;
 	case TT_FUNC:
 		css << "   matO = f(matA);\n";
 		break;
 	case TT_MATRIXTIMESSCALAR:
 		css << "   matO = (" << typeStrA << "(2.0)*matA)*" << typeStrA << "(3.0);\n";
 		break;
 	case TT_MATRIXMULADD:
 		css << "   matO = coopMatMulAddNV(matA, matB, matC);\n";
 		break;
 	case TT_MATRIXMULADD_ARRAY:
 		css << "   matOArr[1] = coopMatMulAddNV(matAArr[1], matBArr[1], matCArr[1]);\n";
 		break;
 	}

 	if (m_data.testType == TT_COMPOSITE_ARRAY ||
 		m_data.testType == TT_MATRIXMULADD_ARRAY)
 	{
 		css << "   matOArr[0] = " << outputMatType.str() << "(0.0);\n";
 		css << "   matO = matOArr[1];\n";
 	}

 	if (m_data.storageClass == SC_WORKGROUP || m_data.storageClass == SC_WORKGROUP_VARIABLE_POINTERS)
 	{
 		deUint32 sharedStride = strides[3] / m_data.workgroupsX;
 		css << "   coopMatStoreNV(matO, sharedO, elementS3, " << sharedStride << ", " << colMajor << ");\n";
 		css << "   controlBarrier(gl_ScopeSubgroup, gl_ScopeSubgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease);\n";
 		css << "   if (subgroupElect()) {\n";
 		css << "       for (int i = 0; i < " << dims[3].rows << "; ++i) {\n"
 			   "       for (int j = 0; j < " << dims[3].cols << "; ++j) {\n"
 			   "           int localElementInput = " << strides[3] << " * " << (m_data.colMajor ? "j" : "i") << " + " << (m_data.colMajor ? "i" : "j") << ";\n"
 			   "           int localElementShared = " << sharedStride << " * " << (m_data.colMajor ? "j" : "i") << " + " << (m_data.colMajor ? "i" : "j") << ";\n"
 			   "           outputO.x[element3 + localElementInput] = sharedO[elementS3 + localElementShared];\n"
 			   "       }\n"
 			   "       }\n";
 		css << "   }\n";
 	}
 	else
 	{
 		css << "   coopMatStoreNV(matO, outputO.x, element3, " << strides[3] << ", " << colMajor << ");\n";
 	}

 	css <<
 		"}\n";

 	const vk::ShaderBuildOptions	buildOptions	(programCollection.usedVulkanVersion, vk::SPIRV_VERSION_1_3, 0u);

 	programCollection.glslSources.add("test") << glu::ComputeSource(css.str()) << buildOptions;
 }

 TestInstance* CooperativeMatrixTestCase::createInstance (Context& context) const
 {
 	return new CooperativeMatrixTestInstance(context, m_data);
 }

 VkBufferCreateInfo makeBufferCreateInfo (const VkDeviceSize			bufferSize,
 										 const VkBufferUsageFlags	usage)
 {
 	const VkBufferCreateInfo bufferCreateInfo =
 	{
 		VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,	// VkStructureType		sType;
 		DE_NULL,								// const void*			pNext;
 		(VkBufferCreateFlags)0,					// VkBufferCreateFlags	flags;
 		bufferSize,								// VkDeviceSize			size;
 		usage,									// VkBufferUsageFlags	usage;
 		VK_SHARING_MODE_EXCLUSIVE,				// VkSharingMode		sharingMode;
 		0u,										// deUint32				queueFamilyIndexCount;
 		DE_NULL,								// const deUint32*		pQueueFamilyIndices;
 	};
 	return bufferCreateInfo;
 }

 Move<VkDescriptorSet> makeDescriptorSet (const DeviceInterface&			vk,
 										 const VkDevice					device,
 										 const VkDescriptorPool			descriptorPool,
 										 const VkDescriptorSetLayout	setLayout)
 {
 	const VkDescriptorSetAllocateInfo allocateParams =
 	{
 		VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,		// VkStructureType				sType;
 		DE_NULL,											// const void*					pNext;
 		descriptorPool,										// VkDescriptorPool				descriptorPool;
 		1u,													// deUint32						setLayoutCount;
 		&setLayout,											// const VkDescriptorSetLayout*	pSetLayouts;
 	};
 	return allocateDescriptorSet(vk, device, &allocateParams);
 }

 static void setData(void *base, VkComponentTypeNV dt, deUint32 i, float value)
 {
 	if (dt == VK_COMPONENT_TYPE_FLOAT32_NV)
 	{
 		((float *)base)[i] = value;
 	}
 	else
 	{
 		DE_ASSERT(dt == VK_COMPONENT_TYPE_FLOAT16_NV);
 		((deFloat16 *)base)[i] = deFloat32To16(value);
 	}
 }

 static float getData(void *base, VkComponentTypeNV dt, deUint32 i)
 {
 	if (dt == VK_COMPONENT_TYPE_FLOAT32_NV)
 	{
 		return ((float *)base)[i];
 	}
 	else
 	{
 		DE_ASSERT(dt == VK_COMPONENT_TYPE_FLOAT16_NV);
 		return deFloat16To32(((deFloat16 *)base)[i]);
 	}
 }

 tcu::TestStatus CooperativeMatrixTestInstance::iterate (void)
 {
 	const DeviceInterface&	vk						= m_context.getDeviceInterface();
 	const VkDevice			device					= m_context.getDevice();
 	Allocator&				allocator				= m_context.getDefaultAllocator();

 	deRandom rnd;
 	deRandom_init(&rnd, 1234);

 	vk::VkPhysicalDeviceSubgroupProperties subgroupProperties;
 	deMemset(&subgroupProperties, 0, sizeof(subgroupProperties));
 	subgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;

 	vk::VkPhysicalDeviceProperties2 properties2;
 	deMemset(&properties2, 0, sizeof(properties2));
 	properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
 	properties2.pNext = &subgroupProperties;

 	m_context.getInstanceInterface().getPhysicalDeviceProperties2(m_context.getPhysicalDevice(), &properties2);

 	struct
 	{
 		deUint32 rows, cols;
 	} dims[4];

 	if (m_data.testType == TT_MATRIXMULADD ||
 		m_data.testType == TT_MATRIXMULADD_ARRAY)
 	{
 		dims[0].rows = m_data.M;
 		dims[0].cols = m_data.K;
 		dims[1].rows = m_data.K;
 		dims[1].cols = m_data.N;
 		dims[2].rows = m_data.M;
 		dims[2].cols = m_data.N;
 		dims[3].rows = m_data.M;
 		dims[3].cols = m_data.N;
 	}
 	else
 	{
 		dims[0].rows = m_data.M;
 		dims[0].cols = m_data.N;
 		dims[1].rows = m_data.M;
 		dims[1].cols = m_data.N;
 		dims[2].rows = m_data.M;
 		dims[2].cols = m_data.N;
 		dims[3].rows = m_data.M;
 		dims[3].cols = m_data.N;
 	}

 	VkComponentTypeNV dataTypes[4];
 	size_t elementSize[4];
 	VkDeviceSize bufferSizes[5];
 	de::MovePtr<BufferWithMemory> buffers[5];
 	vk::VkDescriptorBufferInfo bufferDescriptors[5];
 	deUint32 strides[4]; // in elements
 	deUint32 totalElements[4];

 	for (deUint32 i = 0; i < 5; ++i)
 	{
 		if (i < 4)
 		{
 			// A/B use input type, C/D use output type
 			dataTypes[i] = (i < 2) ? m_data.inputType : m_data.outputType;
 			elementSize[i] = (dataTypes[i] == VK_COMPONENT_TYPE_FLOAT16_NV) ? 2 : 4;

 			strides[i] = (m_data.colMajor ? dims[i].rows : dims[i].cols) * m_data.subgroupsPerWorkgroupX * m_data.workgroupsX;
 			totalElements[i] = strides[i] * (m_data.colMajor ? dims[i].cols : dims[i].rows) * m_data.subgroupsPerWorkgroupY * m_data.workgroupsY;

 			bufferSizes[i] = totalElements[i] * elementSize[i];
 		}
 		else
 		{
 			bufferSizes[4] = sizeof(VkDeviceAddress)*4;
 		}

 		try
 		{
 			buffers[i] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
 				vk, device, allocator, makeBufferCreateInfo(bufferSizes[i], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT|VK_BUFFER_USAGE_TRANSFER_DST_BIT|VK_BUFFER_USAGE_TRANSFER_SRC_BIT|VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT),
 				MemoryRequirement::HostVisible | MemoryRequirement::Cached | MemoryRequirement::Coherent));
 		}
 		catch (const tcu::NotSupportedError&)
 		{
 			buffers[i] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
 				vk, device, allocator, makeBufferCreateInfo(bufferSizes[i], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT|VK_BUFFER_USAGE_TRANSFER_DST_BIT|VK_BUFFER_USAGE_TRANSFER_SRC_BIT|VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT),
 				MemoryRequirement::HostVisible));
 		}

 		bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, bufferSizes[i]);
 	}

 	void *ptrs[5];
 	for (deUint32 i = 0; i < 5; ++i)
 	{
 		ptrs[i] = buffers[i]->getAllocation().getHostPtr();
 	}

 	vk::DescriptorSetLayoutBuilder layoutBuilder;

 	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, allShaderStages);
 	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, allShaderStages);
 	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, allShaderStages);
 	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, allShaderStages);
 	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, allShaderStages);

 	vk::Unique<vk::VkDescriptorSetLayout>	descriptorSetLayout(layoutBuilder.build(vk, device));

 	vk::Unique<vk::VkDescriptorPool>		descriptorPool(vk::DescriptorPoolBuilder()
 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 5u)
 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
 	vk::Unique<vk::VkDescriptorSet>			descriptorSet		(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

 	vk::DescriptorSetUpdateBuilder setUpdateBuilder;
 	if (m_data.storageClass == SC_PHYSICAL_STORAGE_BUFFER)
 	{
 		VkBufferDeviceAddressInfoEXT info =
 		{
 			VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT,	// VkStructureType	 sType;
 			DE_NULL,											// const void*		 pNext;
 			0,													// VkBuffer			buffer
 		};
 		VkDeviceAddress *addrsInMemory = (VkDeviceAddress *)ptrs[4];
 		for (deUint32 i = 0; i < 4; ++i)
 		{
 			info.buffer = **buffers[i];
 			VkDeviceAddress addr = vk.getBufferDeviceAddressEXT(device, &info);
 			addrsInMemory[i] = addr;
 		}
 		setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(4),
 			VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[4]);
 	}
 	else
 	{
 		setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(0),
 			VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[0]);
 		setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(1),
 			VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[1]);
 		setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(2),
 			VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[2]);
 		setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(3),
 			VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[3]);
 	}

 	setUpdateBuilder.update(vk, device);

 	const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo =
 	{
 		VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,				// sType
 		DE_NULL,													// pNext
 		(VkPipelineLayoutCreateFlags)0,
 		1,															// setLayoutCount
 		&descriptorSetLayout.get(),									// pSetLayouts
 		0u,															// pushConstantRangeCount
 		DE_NULL,													// pPushConstantRanges
 	};

 	Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);

 	Move<VkPipeline> pipeline;

 	VkPipelineBindPoint bindPoint = VK_PIPELINE_BIND_POINT_COMPUTE;

 	const deUint32 specData[6] =
 	{
 		subgroupProperties.subgroupSize * m_data.subgroupsPerWorkgroupX,
 		m_data.subgroupsPerWorkgroupY,
 		strides[0],
 		strides[1],
 		strides[2],
 		strides[3],
 	};

 	const vk::VkSpecializationMapEntry entries[6] =
 	{
 		{0, (deUint32)(sizeof(deUint32) * 0), sizeof(deUint32)},
 		{1, (deUint32)(sizeof(deUint32) * 1), sizeof(deUint32)},
 		{2, (deUint32)(sizeof(deUint32) * 2), sizeof(deUint32)},
 		{3, (deUint32)(sizeof(deUint32) * 3), sizeof(deUint32)},
 		{4, (deUint32)(sizeof(deUint32) * 4), sizeof(deUint32)},
 		{5, (deUint32)(sizeof(deUint32) * 5), sizeof(deUint32)},
 	};

 	const vk::VkSpecializationInfo specInfo =
 	{
 		6,						// mapEntryCount
 		entries,				// pMapEntries
 		sizeof(specData),		// dataSize
 		specData				// pData
 	};

 	for (deUint32 i = 0; i < 4; ++i)
 		for (deUint32 j = 0; j < totalElements[i]; ++j)
 		{
 			if (m_data.testType != TT_MATRIXMULADD &&
 				m_data.testType != TT_MATRIXMULADD_ARRAY)
 				setData(ptrs[i], dataTypes[i], j, ((float)(deRandom_getUint32(&rnd) & 0xff) - 64.0f)/2.0f);
 			else
 				setData(ptrs[i], dataTypes[i], j, ((float)(deRandom_getUint32(&rnd) & 0xf) - 4.0f)/2.0f);
 		}

 	flushAlloc(vk, device, buffers[0]->getAllocation());
 	flushAlloc(vk, device, buffers[1]->getAllocation());
 	flushAlloc(vk, device, buffers[2]->getAllocation());
 	flushAlloc(vk, device, buffers[3]->getAllocation());

 	const Unique<VkShaderModule>	shader						(createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0));

 	const VkPipelineShaderStageCreateInfo	shaderCreateInfo =
 	{
 		VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
 		DE_NULL,
 		(VkPipelineShaderStageCreateFlags)0,
 		VK_SHADER_STAGE_COMPUTE_BIT,								// stage
 		*shader,													// shader
 		"main",
 		&specInfo,													// pSpecializationInfo
 	};

 	const VkComputePipelineCreateInfo		pipelineCreateInfo =
 	{
 		VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
 		DE_NULL,
 		0u,															// flags
 		shaderCreateInfo,											// cs
 		*pipelineLayout,											// layout
 		(vk::VkPipeline)0,											// basePipelineHandle
 		0u,															// basePipelineIndex
 	};
 	pipeline = createComputePipeline(vk, device, DE_NULL, &pipelineCreateInfo, NULL);

 	const VkQueue					queue					= m_context.getUniversalQueue();
 	Move<VkCommandPool>				cmdPool					= createCommandPool(vk, device, 0, m_context.getUniversalQueueFamilyIndex());
 	Move<VkCommandBuffer>			cmdBuffer				= allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);

 	beginCommandBuffer(vk, *cmdBuffer, 0u);

 	vk.cmdBindDescriptorSets(*cmdBuffer, bindPoint, *pipelineLayout, 0u, 1, &*descriptorSet, 0u, DE_NULL);
 	vk.cmdBindPipeline(*cmdBuffer, bindPoint, *pipeline);

 	vk.cmdDispatch(*cmdBuffer, m_data.workgroupsX, m_data.workgroupsY, 1);

 	endCommandBuffer(vk, *cmdBuffer);

 	submitCommandsAndWait(vk, device, queue, cmdBuffer.get());

 	invalidateAlloc(vk, device, buffers[3]->getAllocation());
 	qpTestResult res = QP_TEST_RESULT_PASS;

 	if (m_data.testType != TT_MATRIXMULADD &&
 		m_data.testType != TT_MATRIXMULADD_ARRAY)
 	{
 		for (deUint32 i = 0; i < totalElements[3]; ++i)
 		{
 			float inputA = getData(ptrs[0], dataTypes[0], i);
 			float inputB = getData(ptrs[1], dataTypes[1], i);
 			float output = getData(ptrs[3], dataTypes[3], i);
 			switch (m_data.testType)
 			{
 			case TT_LENGTH:
 				if (output < 1.0f || output > (float)(m_data.N*m_data.M))
 					res = QP_TEST_RESULT_FAIL;
 				// We expect the matrix to be spread evenly across invocations, it is
 				// surprising (but not necessarily illegal) if not
 				if (output != (float)(m_data.N*m_data.M/subgroupProperties.subgroupSize) &&
 					res == QP_TEST_RESULT_PASS)
 					res = QP_TEST_RESULT_QUALITY_WARNING;
 				break;
 			case TT_CONSTANT:
 				if (output != 1.0f)
 					res = QP_TEST_RESULT_FAIL;
 				break;
 			case TT_FCONVERT:
 				if (output != inputA)
 					res = QP_TEST_RESULT_FAIL;
 				break;
 			case TT_COMPOSITE:
 			case TT_COMPOSITE_RVALUE:
 			case TT_COMPOSITE_ARRAY:
 			case TT_FADD:
 				if (output != inputA + inputB)
 					res = QP_TEST_RESULT_FAIL;
 				break;
 			case TT_FSUB:
 				if (output != inputA - inputB)
 					res = QP_TEST_RESULT_FAIL;
 				break;
 			case TT_FDIV:
 				{
 					float ulp = (m_data.inputType == VK_COMPONENT_TYPE_FLOAT16_NV) ? 1.0f/1024.0f : 1.0f/(8.0f*1024.0f*1024.0f);
 					// division allows 2.5ulp, but we'll use 3.
 					ulp *= 3;
 					if (inputB != 0 && fabs(output - inputA / inputB) > ulp * fabs(inputA / inputB))
 						res = QP_TEST_RESULT_FAIL;
 				}
 				break;
 			case TT_FNEGATE:
 			case TT_FUNC:
 				if (output != -inputA)
 					res = QP_TEST_RESULT_FAIL;
 				break;
 			case TT_MATRIXTIMESSCALAR:
 				if (output != 6.0*inputA)
 					res = QP_TEST_RESULT_FAIL;
 				break;
 			default:
 				break;
 			}
 		}
 	}
 	else
 	{
 		deUint32 ik, kj, ij;
 		for (deUint32 mX = 0; mX < m_data.subgroupsPerWorkgroupX*m_data.workgroupsX; ++mX)
 		{
 			for (deUint32 mY = 0; mY < m_data.subgroupsPerWorkgroupY*m_data.workgroupsY; ++mY)
 			{
 				for (deUint32 i = 0; i < m_data.M; ++i)
 				{
 					for (deUint32 j = 0; j < m_data.N; ++j)
 					{
 						float ref = 0;
 						for (deUint32 k = 0; k < m_data.K; ++k)
 						{
 							if (m_data.colMajor)
 								ik = mX * m_data.M + i + strides[0] * (mY * m_data.K + k);
 							else
 								ik = mX * m_data.K + k + strides[0] * (mY * m_data.M + i);

 							float Aik = getData(ptrs[0], dataTypes[0], ik);

 							if (m_data.colMajor)
 								kj = mX * m_data.K + k + strides[1] * (mY * m_data.N + j);
 							else
 								kj = mX * m_data.N + j + strides[1] * (mY * m_data.K + k);

 							float Bkj = getData(ptrs[1], dataTypes[1], kj);

 							ref += Aik*Bkj;
 						}

 						if (m_data.colMajor)
 							ij = mX * m_data.M + i + strides[2] * (mY * m_data.N + j);
 						else
 							ij = mX * m_data.N + j + strides[2] * (mY * m_data.M + i);

 						float Cij = getData(ptrs[2], dataTypes[2], ij);

 						ref += Cij;

 						float Dij = getData(ptrs[3], dataTypes[3], ij);

 						if (ref != Dij)
 						{
 							res = QP_TEST_RESULT_FAIL;
 						}
 					}
 				}
 			}
 		}
 	}

 	return tcu::TestStatus(res, qpGetTestResultName(res));
 }

 }	// anonymous

 tcu::TestCaseGroup*	createCooperativeMatrixTests (tcu::TestContext& testCtx)
 {
 	de::MovePtr<tcu::TestCaseGroup> group(new tcu::TestCaseGroup(
 			testCtx, "cooperative_matrix", "GL_NV_cooperative_matrix tests"));

 	typedef struct
 	{
 		deUint32				value;
 		const char*				name;
 		const char*				description;
 	} TestGroupCase;

 	TestGroupCase ttCases[] =
 	{
 		{ TT_LENGTH,				"length",					"OpCooperativeMatrixLengthNV"	},
 		{ TT_CONSTANT,				"constant",					"OpConstantComposite"			},
 		{ TT_FCONVERT,				"fconvert",					"OpFConvert"					},
 		{ TT_COMPOSITE,				"composite",				"OpCompositeConstruct"			},
 		{ TT_COMPOSITE_RVALUE,		"composite_rvalue",			"OpCompositeExtract"			},
 		{ TT_FADD,					"fadd",						"OpFAdd"						},
 		{ TT_FSUB,					"fsub",						"OpFSub"						},
 		{ TT_FDIV,					"fdiv",						"OpFDiv"						},
 		{ TT_FNEGATE,				"fnegate",					"OpFNegate"						},
 		{ TT_MATRIXTIMESSCALAR,		"matrixtimesscalar",		"OpMatrixTimesScalar"			},
 		{ TT_FUNC,					"func",						"OpFunctionParameter"			},
 		{ TT_MATRIXMULADD,			"matrixmuladd",				"OpCooperativeMatrixMulAddNV"	},
 		{ TT_COMPOSITE_ARRAY,		"composite_array",			"OpCompositeConstruct w/array"			},
 		{ TT_MATRIXMULADD_ARRAY,	"matrixmuladd_array",		"OpCooperativeMatrixMulAddNV w/array"	},
 	};

 	TestGroupCase dtCases[] =
 	{
 		{ VK_COMPONENT_TYPE_FLOAT32_NV,	"float32",	"C/D type are fp32"		},
 		{ VK_COMPONENT_TYPE_FLOAT16_NV,	"float16",	"C/D type are fp16"		},
 	};

 	TestGroupCase stCases[] =
 	{
 		{ SIZE_8x8,					"8x8",					"8x8 component-wise"			},
 		{ SIZE_16x8,				"16x8",					"16x8 component-wise"			},
 		{ SIZE_16x16,				"16x16",				"16x16 component-wise"			},
 		{ SIZE_16x8x8,				"16x8x8",				"16x8x8 matrix multiple"		},
 		{ SIZE_16x8x16,				"16x8x16",				"16x8x16 matrix multiple"		},
 	};

 	TestGroupCase colCases[] =
 	{
 		{ 0,		"rowmajor",	"row major"		},
 		{ 1,		"colmajor",	"col major"		},
 	};

 	TestGroupCase scCases[] =
 	{
 		{ SC_BUFFER,						"buffer",			"SSBO"				},
 		{ SC_WORKGROUP,						"workgroup",		"shared memory"		},
 		{ SC_BUFFER_VARIABLE_POINTERS,		"buffer_varptr",	"SSBO w/variable pointers"		},
 		{ SC_WORKGROUP_VARIABLE_POINTERS,	"workgroup_varptr",	"shared memory w/variable pointers"		},
 		{ SC_PHYSICAL_STORAGE_BUFFER,		"physical_buffer",	"physical_storage_buffer"				},
 	};

 	const deUint32 M[] =
 	{
 		8u,
 		16u,
 		16u,
 		16u,
 		16u,
 	};
 	const deUint32 N[] =
 	{
 		8u,
 		8u,
 		16u,
 		8u,
 		8u,
 	};
 	const deUint32 K[] =
 	{
 		0u,
 		0u,
 		0u,
 		8u,
 		16u,
 	};

 	for (int ttNdx = 0; ttNdx < DE_LENGTH_OF_ARRAY(ttCases); ttNdx++)
 	{
 		de::MovePtr<tcu::TestCaseGroup> ttGroup(new tcu::TestCaseGroup(testCtx, ttCases[ttNdx].name, ttCases[ttNdx].description));
 		for (int dtNdx = 0; dtNdx < DE_LENGTH_OF_ARRAY(dtCases); dtNdx++)
 		{
 			de::MovePtr<tcu::TestCaseGroup> dtGroup(new tcu::TestCaseGroup(testCtx, dtCases[dtNdx].name, dtCases[dtNdx].description));
 			for (int stNdx = 0; stNdx < DE_LENGTH_OF_ARRAY(stCases); stNdx++)
 			{
 				de::MovePtr<tcu::TestCaseGroup> stGroup(new tcu::TestCaseGroup(testCtx, stCases[stNdx].name, stCases[stNdx].description));
 				for (int scNdx = 0; scNdx < DE_LENGTH_OF_ARRAY(scCases); scNdx++)
 				{
 					de::MovePtr<tcu::TestCaseGroup> scGroup(new tcu::TestCaseGroup(testCtx, scCases[scNdx].name, scCases[scNdx].description));
 					for (int colNdx = 0; colNdx < DE_LENGTH_OF_ARRAY(colCases); colNdx++)
 					{
 						TestType testType = (TestType)ttCases[ttNdx].value;
 						VkComponentTypeNV inputType = (VkComponentTypeNV)dtCases[dtNdx].value;
 						VkComponentTypeNV outputType = (VkComponentTypeNV)dtCases[dtNdx].value;

 						bool isMatrixMul = testType == TT_MATRIXMULADD || testType == TT_MATRIXMULADD_ARRAY;

 						if (isMatrixMul)
 							inputType = VK_COMPONENT_TYPE_FLOAT16_NV;

 						if (testType == TT_FCONVERT)
 						{
 							if (inputType == VK_COMPONENT_TYPE_FLOAT32_NV)
 								outputType = VK_COMPONENT_TYPE_FLOAT16_NV;
 							else
 								outputType = VK_COMPONENT_TYPE_FLOAT32_NV;
 						}

 						SizeType sizeType = (SizeType)stCases[stNdx].value;
 						CaseDef c =
 						{
 							testType,							// TestType testtype;
 							M[sizeType],						// deUint32 M;
 							N[sizeType],						// deUint32 N;
 							K[sizeType],						// deUint32 K;
 							2u,									// deUint32 subgroupsPerWorkgroupX;
 							2u,									// deUint32 subgroupsPerWorkgroupY;
 							4u,									// deUint32 workgroupsX;
 							4u,									// deUint32 workgroupsY;
 							(VkComponentTypeNV)inputType,		// VkComponentTypeNV inputType;
 							(VkComponentTypeNV)outputType,		// VkComponentTypeNV outputType;
 							!!colCases[colNdx].value,			// bool colMajor;
 							(StorageClass)scCases[scNdx].value,	// StorageClass storageClass;
 						};

 						if (isMatrixMul != (sizeType == SIZE_16x8x8 || sizeType == SIZE_16x8x16))
 							continue;

 						scGroup->addChild(new CooperativeMatrixTestCase(testCtx, colCases[colNdx].name, colCases[colNdx].description, c));
 					}
 					stGroup->addChild(scGroup.release());
 				}
 				dtGroup->addChild(stGroup.release());
 			}
 			ttGroup->addChild(dtGroup.release());
 		}
 		group->addChild(ttGroup.release());
 	}
 	return group.release();
 }

 }	// compute
 }	// vkt