/*------------------------------------------------------------------------
* Vulkan Conformance Tests
* ------------------------
*
* Copyright (c) 2015 The Khronos Group Inc.
* Copyright (c) 2017 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*//*!
* \file
* \brief Atomic operations (OpAtomic*) tests.
*//*--------------------------------------------------------------------*/
#include "vktAtomicOperationTests.hpp"
#include "vktShaderExecutor.hpp"
#include "vkRefUtil.hpp"
#include "vkMemUtil.hpp"
#include "vkQueryUtil.hpp"
#include "vkObjUtil.hpp"
#include "vkBarrierUtil.hpp"
#include "vkCmdUtil.hpp"
#include "vktTestGroupUtil.hpp"
#include "tcuTestLog.hpp"
#include "tcuStringTemplate.hpp"
#include "tcuResultCollector.hpp"
#include "deFloat16.h"
#include "deMath.hpp"
#include "deStringUtil.hpp"
#include "deSharedPtr.hpp"
#include "deRandom.hpp"
#include "deArrayUtil.hpp"
#include <string>
#include <memory>
#include <cmath>
namespace vkt
{
namespace shaderexecutor
{
namespace
{
using de::UniquePtr;
using de::MovePtr;
using std::vector;
using namespace vk;
enum class AtomicMemoryType
{
BUFFER = 0, // Normal buffer.
SHARED, // Shared global struct in a compute workgroup.
REFERENCE, // Buffer passed as a reference.
};
// Helper class to indicate the shader type and the type of memory the atomic operations act on.
class AtomicShaderType
{
public:
AtomicShaderType (glu::ShaderType type, AtomicMemoryType memoryType)
: m_type (type)
, m_atomicMemoryType (memoryType)
{
// Shared memory can only be used with compute shaders.
DE_ASSERT(memoryType != AtomicMemoryType::SHARED || type == glu::SHADERTYPE_COMPUTE);
}
glu::ShaderType getType (void) const { return m_type; }
AtomicMemoryType getMemoryType (void) const { return m_atomicMemoryType; }
private:
glu::ShaderType m_type;
AtomicMemoryType m_atomicMemoryType;
};
// Buffer helper
class Buffer
{
public:
Buffer (Context& context, VkBufferUsageFlags usage, size_t size, bool useRef);
VkBuffer getBuffer (void) const { return *m_buffer; }
void* getHostPtr (void) const { return m_allocation->getHostPtr(); }
void flush (void);
void invalidate (void);
private:
const DeviceInterface& m_vkd;
const VkDevice m_device;
const VkQueue m_queue;
const deUint32 m_queueIndex;
const Unique<VkBuffer> m_buffer;
const UniquePtr<Allocation> m_allocation;
};
typedef de::SharedPtr<Buffer> BufferSp;
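// Creates a buffer of the given size and usage flags, using exclusive sharing mode.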
Move<VkBuffer> createBuffer (const DeviceInterface& vkd, VkDevice device, VkDeviceSize size, VkBufferUsageFlags usageFlags)
{
const VkBufferCreateInfo createInfo =
{
VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
DE_NULL,
(VkBufferCreateFlags)0,
size,
usageFlags,
VK_SHARING_MODE_EXCLUSIVE,
0u,
DE_NULL
};
return createBuffer(vkd, device, &createInfo);
}
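// Allocates host-visible memory for the buffer (also requesting a device address when buffer
// references are used) and binds it to the buffer.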
MovePtr<Allocation> allocateAndBindMemory (const DeviceInterface& vkd, VkDevice device, Allocator& allocator, VkBuffer buffer, bool useRef)
{
const MemoryRequirement allocationType = (MemoryRequirement::HostVisible | (useRef ? MemoryRequirement::DeviceAddress : MemoryRequirement::Any));
MovePtr<Allocation> alloc(allocator.allocate(getBufferMemoryRequirements(vkd, device, buffer), allocationType));
VK_CHECK(vkd.bindBufferMemory(device, buffer, alloc->getMemory(), alloc->getOffset()));
return alloc;
}
Buffer::Buffer (Context& context, VkBufferUsageFlags usage, size_t size, bool useRef)
: m_vkd (context.getDeviceInterface())
, m_device (context.getDevice())
, m_queue (context.getUniversalQueue())
, m_queueIndex (context.getUniversalQueueFamilyIndex())
, m_buffer (createBuffer (context.getDeviceInterface(),
context.getDevice(),
(VkDeviceSize)size,
usage))
, m_allocation (allocateAndBindMemory (context.getDeviceInterface(),
context.getDevice(),
context.getDefaultAllocator(),
*m_buffer,
useRef))
{
}
void Buffer::flush (void)
{
flushMappedMemoryRange(m_vkd, m_device, m_allocation->getMemory(), m_allocation->getOffset(), VK_WHOLE_SIZE);
}
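// Makes device writes to the buffer visible to the host: submits a memory barrier from all
// commands to the host stage, then invalidates the mapped memory range.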
void Buffer::invalidate (void)
{
const auto cmdPool = vk::makeCommandPool(m_vkd, m_device, m_queueIndex);
const auto cmdBufferPtr = vk::allocateCommandBuffer(m_vkd, m_device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
const auto cmdBuffer = cmdBufferPtr.get();
const auto bufferBarrier = vk::makeBufferMemoryBarrier(VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, m_buffer.get(), 0ull, VK_WHOLE_SIZE);
beginCommandBuffer(m_vkd, cmdBuffer);
m_vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr, 1u, &bufferBarrier, 0u, nullptr);
endCommandBuffer(m_vkd, cmdBuffer);
submitCommandsAndWait(m_vkd, m_device, m_queue, cmdBuffer);
invalidateMappedMemoryRange(m_vkd, m_device, m_allocation->getMemory(), m_allocation->getOffset(), VK_WHOLE_SIZE);
}
// Tests
enum AtomicOperation
{
ATOMIC_OP_EXCHANGE = 0,
ATOMIC_OP_COMP_SWAP,
ATOMIC_OP_ADD,
ATOMIC_OP_MIN,
ATOMIC_OP_MAX,
ATOMIC_OP_AND,
ATOMIC_OP_OR,
ATOMIC_OP_XOR,
ATOMIC_OP_LAST
};
std::string atomicOp2Str (AtomicOperation op)
{
static const char* const s_names[] =
{
"atomicExchange",
"atomicCompSwap",
"atomicAdd",
"atomicMin",
"atomicMax",
"atomicAnd",
"atomicOr",
"atomicXor"
};
return de::getSizedArrayElement<ATOMIC_OP_LAST>(s_names, op);
}
enum
{
NUM_ELEMENTS = 32
};
enum DataType
{
DATA_TYPE_FLOAT16 = 0,
DATA_TYPE_INT32,
DATA_TYPE_UINT32,
DATA_TYPE_FLOAT32,
DATA_TYPE_INT64,
DATA_TYPE_UINT64,
DATA_TYPE_FLOAT64,
DATA_TYPE_LAST
};
std::string dataType2Str(DataType type)
{
static const char* const s_names[] =
{
"float16_t",
"int",
"uint",
"float",
"int64_t",
"uint64_t",
"double",
};
return de::getSizedArrayElement<DATA_TYPE_LAST>(s_names, type);
}
class BufferInterface
{
public:
virtual void setBuffer(void* ptr) = 0;
virtual size_t bufferSize() = 0;
virtual void fillWithTestData(de::Random &rnd) = 0;
virtual void checkResults(tcu::ResultCollector& resultCollector) = 0;
virtual ~BufferInterface() {}
};
template<typename dataTypeT>
class TestBuffer : public BufferInterface
{
public:
TestBuffer(AtomicOperation atomicOp)
: m_atomicOp(atomicOp)
{}
template<typename T>
struct BufferData
{
// Use half the number of elements for inout to cause overlap between atomic operations.
// Each inout element at index i will have two atomic operations using input from
// indices i and i + NUM_ELEMENTS / 2.
T inout[NUM_ELEMENTS / 2];
T input[NUM_ELEMENTS];
T compare[NUM_ELEMENTS];
T output[NUM_ELEMENTS];
T invocationHitCount[NUM_ELEMENTS];
deInt32 index;
};
virtual void setBuffer(void* ptr)
{
m_ptr = static_cast<BufferData<dataTypeT>*>(ptr);
}
virtual size_t bufferSize()
{
return sizeof(BufferData<dataTypeT>);
}
virtual void fillWithTestData(de::Random &rnd)
{
dataTypeT pattern;
deMemset(&pattern, 0xcd, sizeof(dataTypeT));
for (int i = 0; i < NUM_ELEMENTS / 2; i++)
{
m_ptr->inout[i] = static_cast<dataTypeT>(rnd.getUint64());
// The first half of the compare elements matches the inout value at even indices.
// The second half matches at odd indices. This ensures that only one of the two
// overlapping compSwap operations succeeds for each inout element.
m_ptr->compare[i] = m_ptr->inout[i] + (i % 2);
m_ptr->compare[i + NUM_ELEMENTS / 2] = m_ptr->inout[i] + 1 - (i % 2);
}
for (int i = 0; i < NUM_ELEMENTS; i++)
{
m_ptr->input[i] = static_cast<dataTypeT>(rnd.getUint64());
m_ptr->output[i] = pattern;
m_ptr->invocationHitCount[i] = 0;
}
m_ptr->index = 0;
// Take a copy to be used when calculating expected values.
m_original = *m_ptr;
}
virtual void checkResults(tcu::ResultCollector& resultCollector)
{
checkOperation(m_original, *m_ptr, resultCollector);
}
template<typename T>
struct Expected
{
T m_inout;
T m_output[2];
Expected (T inout, T output0, T output1)
: m_inout(inout)
{
m_output[0] = output0;
m_output[1] = output1;
}
bool compare (T inout, T output0, T output1)
{
return (deMemCmp((const void*)&m_inout, (const void*)&inout, sizeof(inout)) == 0
&& deMemCmp((const void*)&m_output[0], (const void*)&output0, sizeof(output0)) == 0
&& deMemCmp((const void*)&m_output[1], (const void*)&output1, sizeof(output1)) == 0);
}
};
void checkOperation (const BufferData<dataTypeT>& original,
const BufferData<dataTypeT>& result,
tcu::ResultCollector& resultCollector);
const AtomicOperation m_atomicOp;
BufferData<dataTypeT>* m_ptr;
BufferData<dataTypeT> m_original;
};
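// Compares two floating-point values, treating any two NaNs as equal and other values as
// equal when they differ by less than a small absolute tolerance.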
template<typename T>
bool nanSafeSloppyEquals(T x, T y)
{
if (deIsIEEENaN(x) && deIsIEEENaN(y))
return true;
if (deIsIEEENaN(x) || deIsIEEENaN(y))
return false;
return fabs(deToDouble(x) - deToDouble(y)) < 0.00001;
}
template<typename dataTypeT>
class TestBufferFloatingPoint : public BufferInterface
{
public:
TestBufferFloatingPoint(AtomicOperation atomicOp)
: m_atomicOp(atomicOp)
{}
template<typename T>
struct BufferDataFloatingPoint
{
// Use half the number of elements for inout to cause overlap between atomic operations.
// Each inout element at index i will have two atomic operations using input from
// indices i and i + NUM_ELEMENTS / 2.
T inout[NUM_ELEMENTS / 2];
T input[NUM_ELEMENTS];
T compare[NUM_ELEMENTS];
T output[NUM_ELEMENTS];
deInt32 invocationHitCount[NUM_ELEMENTS];
deInt32 index;
};
virtual void setBuffer(void* ptr)
{
m_ptr = static_cast<BufferDataFloatingPoint<dataTypeT>*>(ptr);
}
virtual size_t bufferSize()
{
return sizeof(BufferDataFloatingPoint<dataTypeT>);
}
virtual void fillWithTestData(de::Random& rnd)
{
dataTypeT pattern;
deMemset(&pattern, 0xcd, sizeof(dataTypeT));
for (int i = 0; i < NUM_ELEMENTS / 2; i++)
{
m_ptr->inout[i] = deToFloatType<dataTypeT>(rnd.getFloat());
// These aren't used by any of the float tests
m_ptr->compare[i] = deToFloatType<dataTypeT>(0.0);
}
for (int i = 0; i < NUM_ELEMENTS; i++)
{
m_ptr->input[i] = deToFloatType<dataTypeT>(rnd.getFloat());
m_ptr->output[i] = pattern;
m_ptr->invocationHitCount[i] = 0;
}
// Add special cases for NaN and +/-0 after the random fill so they are not overwritten.
// 0: min(sNaN, x)
m_ptr->inout[0] = deSignalingNaN<dataTypeT>();
// 1: min(x, sNaN)
m_ptr->input[1 * 2 + 0] = deSignalingNaN<dataTypeT>();
// 2: min(qNaN, x)
m_ptr->inout[2] = deQuietNaN<dataTypeT>();
// 3: min(x, qNaN)
m_ptr->input[3 * 2 + 0] = deQuietNaN<dataTypeT>();
// 4: min(NaN, NaN)
m_ptr->inout[4] = deSignalingNaN<dataTypeT>();
m_ptr->input[4 * 2 + 0] = deQuietNaN<dataTypeT>();
m_ptr->input[4 * 2 + 1] = deQuietNaN<dataTypeT>();
// 5: min(+0, -0)
m_ptr->inout[5] = deToFloatType<dataTypeT>(-0.0);
m_ptr->input[5 * 2 + 0] = deToFloatType<dataTypeT>(0.0);
m_ptr->input[5 * 2 + 1] = deToFloatType<dataTypeT>(0.0);
m_ptr->index = 0;
// Take a copy to be used when calculating expected values.
m_original = *m_ptr;
}
virtual void checkResults(tcu::ResultCollector& resultCollector)
{
checkOperationFloatingPoint(m_original, *m_ptr, resultCollector);
}
template<typename T>
struct Expected
{
T m_inout;
T m_output[2];
Expected(T inout, T output0, T output1)
: m_inout(inout)
{
m_output[0] = output0;
m_output[1] = output1;
}
bool compare(T inout, T output0, T output1)
{
return nanSafeSloppyEquals(m_inout, inout) &&
nanSafeSloppyEquals(m_output[0], output0) &&
nanSafeSloppyEquals(m_output[1], output1);
}
};
void checkOperationFloatingPoint(const BufferDataFloatingPoint<dataTypeT>& original,
const BufferDataFloatingPoint<dataTypeT>& result,
tcu::ResultCollector& resultCollector);
const AtomicOperation m_atomicOp;
BufferDataFloatingPoint<dataTypeT>* m_ptr;
BufferDataFloatingPoint<dataTypeT> m_original;
};
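// Factory that selects the integer or floating-point test buffer implementation for the
// requested data type.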
static BufferInterface* createTestBuffer(DataType type, AtomicOperation atomicOp)
{
switch (type)
{
case DATA_TYPE_FLOAT16:
return new TestBufferFloatingPoint<deFloat16>(atomicOp);
case DATA_TYPE_INT32:
return new TestBuffer<deInt32>(atomicOp);
case DATA_TYPE_UINT32:
return new TestBuffer<deUint32>(atomicOp);
case DATA_TYPE_FLOAT32:
return new TestBufferFloatingPoint<float>(atomicOp);
case DATA_TYPE_INT64:
return new TestBuffer<deInt64>(atomicOp);
case DATA_TYPE_UINT64:
return new TestBuffer<deUint64>(atomicOp);
case DATA_TYPE_FLOAT64:
return new TestBufferFloatingPoint<double>(atomicOp);
default:
DE_ASSERT(false);
return DE_NULL;
}
}
// Use template to handle both signed and unsigned cases. SPIR-V should
// have separate operations for both.
template<typename T>
void TestBuffer<T>::checkOperation (const BufferData<T>& original,
const BufferData<T>& result,
tcu::ResultCollector& resultCollector)
{
// originalInout = original inout
// input0 = input at index i
// input1 = input at index i + NUM_ELEMENTS / 2
//
// The atomic operation returns the memory contents before
// the operation, and that value is stored as output. Two operations
// are executed for each InOut value (using input0 and input1).
//
// Since two operations overlap on each InOut element, the
// resulting InOut value and the outputs of the operations
// have two result candidates depending on the execution
// order. Verification passes if the results match one of
// these options.
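//
// For example (illustrative values), with atomicAdd and originalInout = 1, input0 = 2,
// input1 = 3: either output0 = 1 and output1 = 3 (operation 0 executed first), or
// output0 = 4 and output1 = 1 (operation 1 executed first); the final InOut value is 6
// in both cases.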
for (int elementNdx = 0; elementNdx < NUM_ELEMENTS / 2; elementNdx++)
{
// Needed when reinterpreting the data as signed values.
const T originalInout = *reinterpret_cast<const T*>(&original.inout[elementNdx]);
const T input0 = *reinterpret_cast<const T*>(&original.input[elementNdx]);
const T input1 = *reinterpret_cast<const T*>(&original.input[elementNdx + NUM_ELEMENTS / 2]);
// Expected results are collected to this vector.
vector<Expected<T> > exp;
switch (m_atomicOp)
{
case ATOMIC_OP_ADD:
{
exp.push_back(Expected<T>(originalInout + input0 + input1, originalInout, originalInout + input0));
exp.push_back(Expected<T>(originalInout + input0 + input1, originalInout + input1, originalInout));
}
break;
case ATOMIC_OP_AND:
{
exp.push_back(Expected<T>(originalInout & input0 & input1, originalInout, originalInout & input0));
exp.push_back(Expected<T>(originalInout & input0 & input1, originalInout & input1, originalInout));
}
break;
case ATOMIC_OP_OR:
{
exp.push_back(Expected<T>(originalInout | input0 | input1, originalInout, originalInout | input0));
exp.push_back(Expected<T>(originalInout | input0 | input1, originalInout | input1, originalInout));
}
break;
case ATOMIC_OP_XOR:
{
exp.push_back(Expected<T>(originalInout ^ input0 ^ input1, originalInout, originalInout ^ input0));
exp.push_back(Expected<T>(originalInout ^ input0 ^ input1, originalInout ^ input1, originalInout));
}
break;
case ATOMIC_OP_MIN:
{
exp.push_back(Expected<T>(de::min(de::min(originalInout, input0), input1), originalInout, de::min(originalInout, input0)));
exp.push_back(Expected<T>(de::min(de::min(originalInout, input0), input1), de::min(originalInout, input1), originalInout));
}
break;
case ATOMIC_OP_MAX:
{
exp.push_back(Expected<T>(de::max(de::max(originalInout, input0), input1), originalInout, de::max(originalInout, input0)));
exp.push_back(Expected<T>(de::max(de::max(originalInout, input0), input1), de::max(originalInout, input1), originalInout));
}
break;
case ATOMIC_OP_EXCHANGE:
{
exp.push_back(Expected<T>(input1, originalInout, input0));
exp.push_back(Expected<T>(input0, input1, originalInout));
}
break;
case ATOMIC_OP_COMP_SWAP:
{
if (elementNdx % 2 == 0)
{
exp.push_back(Expected<T>(input0, originalInout, input0));
exp.push_back(Expected<T>(input0, originalInout, originalInout));
}
else
{
exp.push_back(Expected<T>(input1, input1, originalInout));
exp.push_back(Expected<T>(input1, originalInout, originalInout));
}
}
break;
default:
DE_FATAL("Unexpected atomic operation.");
break;
}
const T resIo = result.inout[elementNdx];
const T resOutput0 = result.output[elementNdx];
const T resOutput1 = result.output[elementNdx + NUM_ELEMENTS / 2];
if (!exp[0].compare(resIo, resOutput0, resOutput1) && !exp[1].compare(resIo, resOutput0, resOutput1))
{
std::ostringstream errorMessage;
errorMessage << "ERROR: Result value check failed at index " << elementNdx
<< ". Expected one of the two outcomes: InOut = " << tcu::toHex(exp[0].m_inout)
<< ", Output0 = " << tcu::toHex(exp[0].m_output[0]) << ", Output1 = "
<< tcu::toHex(exp[0].m_output[1]) << ", or InOut = " << tcu::toHex(exp[1].m_inout)
<< ", Output0 = " << tcu::toHex(exp[1].m_output[0]) << ", Output1 = "
<< tcu::toHex(exp[1].m_output[1]) << ". Got: InOut = " << tcu::toHex(resIo)
<< ", Output0 = " << tcu::toHex(resOutput0) << ", Output1 = "
<< tcu::toHex(resOutput1) << ". Using Input0 = " << tcu::toHex(original.input[elementNdx])
<< " and Input1 = " << tcu::toHex(original.input[elementNdx + NUM_ELEMENTS / 2]) << ".";
resultCollector.fail(errorMessage.str());
}
}
}
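// Collects the set of results an atomic float min/max may produce when one of the operands
// is a NaN, or when +0 and -0 are compared; the verification later accepts any value in this set.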
template<typename T>
void handleExceptionalFloatMinMaxValues(vector<T> &values, T x, T y)
{
if (deIsSignalingNaN(x) && deIsSignalingNaN(y))
{
values.push_back(deQuietNaN<T>());
values.push_back(deSignalingNaN<T>());
}
else if (deIsSignalingNaN(x))
{
values.push_back(deQuietNaN<T>());
values.push_back(deSignalingNaN<T>());
if (!deIsIEEENaN(y))
values.push_back(y);
}
else if (deIsSignalingNaN(y))
{
values.push_back(deQuietNaN<T>());
values.push_back(deSignalingNaN<T>());
if (!deIsIEEENaN(x))
values.push_back(x);
}
else if (deIsIEEENaN(x) && deIsIEEENaN(y))
{
// Both quiet NaNs
values.push_back(deQuietNaN<T>());
}
else if (deIsIEEENaN(x))
{
// One quiet NaN and one non-NaN.
values.push_back(y);
}
else if (deIsIEEENaN(y))
{
// One quiet NaN and one non-NaN.
values.push_back(x);
}
else if ((deIsPositiveZero(x) && deIsNegativeZero(y)) || (deIsNegativeZero(x) && deIsPositiveZero(y)))
{
values.push_back(deToFloatType<T>(0.0));
values.push_back(deToFloatType<T>(-0.0));
}
}
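// Floating-point addition performed in double precision; returns a quiet NaN if either
// operand is a NaN.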
template<typename T>
T floatAdd(T x, T y)
{
if (deIsIEEENaN(x) || deIsIEEENaN(y))
return deQuietNaN<T>();
return deToFloatType<T>(deToDouble(x) + deToDouble(y));
}
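// Returns the set of acceptable results for min(x, y), including the special NaN and
// signed-zero cases handled above.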
template<typename T>
vector<T> floatMinValues(T x, T y)
{
vector<T> values;
handleExceptionalFloatMinMaxValues(values, x, y);
if (values.empty())
{
values.push_back(deToDouble(x) < deToDouble(y) ? x : y);
}
return values;
}
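// Returns the set of acceptable results for max(x, y), including the special NaN and
// signed-zero cases handled above.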
template<typename T>
vector<T> floatMaxValues(T x, T y)
{
vector<T> values;
handleExceptionalFloatMinMaxValues(values, x, y);
if (values.empty())
{
values.push_back(deToDouble(x) > deToDouble(y) ? x : y);
}
return values;
}
// Use a template to handle the float16, float and double cases. SPIR-V should
// have separate operations for each.
template<typename T>
void TestBufferFloatingPoint<T>::checkOperationFloatingPoint(const BufferDataFloatingPoint<T>& original,
const BufferDataFloatingPoint<T>& result,
tcu::ResultCollector& resultCollector)
{
// originalInout = original inout
// input0 = input at index i
// input1 = input at index i + NUM_ELEMENTS / 2
//
// The atomic operation returns the memory contents before
// the operation, and that value is stored as output. Two operations
// are executed for each InOut value (using input0 and input1).
//
// Since two operations overlap on each InOut element, the
// resulting InOut value and the outputs of the operations
// have several result candidates depending on the execution
// order (and on how NaNs and signed zeros are handled).
// Verification passes if the results match one of these options.
for (int elementNdx = 0; elementNdx < NUM_ELEMENTS / 2; elementNdx++)
{
// Needed when reinterpreting the data as signed values.
const T originalInout = *reinterpret_cast<const T*>(&original.inout[elementNdx]);
const T input0 = *reinterpret_cast<const T*>(&original.input[elementNdx]);
const T input1 = *reinterpret_cast<const T*>(&original.input[elementNdx + NUM_ELEMENTS / 2]);
// Expected results are collected to this vector.
vector<Expected<T> > exp;
switch (m_atomicOp)
{
case ATOMIC_OP_ADD:
{
exp.push_back(Expected<T>(floatAdd(floatAdd(originalInout, input0), input1), originalInout, floatAdd(originalInout, input0)));
exp.push_back(Expected<T>(floatAdd(floatAdd(originalInout, input0), input1), floatAdd(originalInout, input1), originalInout));
}
break;
case ATOMIC_OP_MIN:
{
// The case where input0 is combined first
vector<T> minOriginalAndInput0 = floatMinValues(originalInout, input0);
for (T x : minOriginalAndInput0)
{
vector<T> minAll = floatMinValues(x, input1);
for (T y : minAll)
{
exp.push_back(Expected<T>(y, originalInout, x));
}
}
// The case where input1 is combined first
vector<T> minOriginalAndInput1 = floatMinValues(originalInout, input1);
for (T x : minOriginalAndInput1)
{
vector<T> minAll = floatMinValues(x, input0);
for (T y : minAll)
{
exp.push_back(Expected<T>(y, x, originalInout));
}
}
}
break;
case ATOMIC_OP_MAX:
{
// The case where input0 is combined first
vector<T> maxOriginalAndInput0 = floatMaxValues(originalInout, input0);
for (T x : maxOriginalAndInput0)
{
vector<T> maxAll = floatMaxValues(x, input1);
for (T y : maxAll)
{
exp.push_back(Expected<T>(y, originalInout, x));
}
}
// The case where input1 is combined first
vector<T> maxOriginalAndInput1 = floatMaxValues(originalInout, input1);
for (T x : maxOriginalAndInput1)
{
vector<T> maxAll = floatMaxValues(x, input0);
for (T y : maxAll)
{
exp.push_back(Expected<T>(y, x, originalInout));
}
}
}
break;
case ATOMIC_OP_EXCHANGE:
{
exp.push_back(Expected<T>(input1, originalInout, input0));
exp.push_back(Expected<T>(input0, input1, originalInout));
}
break;
default:
DE_FATAL("Unexpected atomic operation.");
break;
}
const T resIo = result.inout[elementNdx];
const T resOutput0 = result.output[elementNdx];
const T resOutput1 = result.output[elementNdx + NUM_ELEMENTS / 2];
bool hasMatch = false;
for (Expected<T> e : exp)
{
if (e.compare(resIo, resOutput0, resOutput1))
{
hasMatch = true;
break;
}
}
if (!hasMatch)
{
std::ostringstream errorMessage;
errorMessage << "ERROR: Result value check failed at index " << elementNdx
<< ". Expected one of the outcomes:";
bool first = true;
for (Expected<T> e : exp)
{
if (!first)
errorMessage << ", or";
first = false;
errorMessage << " InOut = " << e.m_inout
<< ", Output0 = " << e.m_output[0]
<< ", Output1 = " << e.m_output[1];
}
errorMessage << ". Got: InOut = " << resIo
<< ", Output0 = " << resOutput0
<< ", Output1 = " << resOutput1
<< ". Using Input0 = " << original.input[elementNdx]
<< " and Input1 = " << original.input[elementNdx + NUM_ELEMENTS / 2] << ".";
resultCollector.fail(errorMessage.str());
}
}
}
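// Test instance: fills the test buffer with input data, runs the shader through the shader
// executor and verifies the results of the atomic operations on the host.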
class AtomicOperationCaseInstance : public TestInstance
{
public:
AtomicOperationCaseInstance (Context& context,
const ShaderSpec& shaderSpec,
AtomicShaderType shaderType,
DataType dataType,
AtomicOperation atomicOp);
virtual tcu::TestStatus iterate (void);
private:
const ShaderSpec& m_shaderSpec;
AtomicShaderType m_shaderType;
const DataType m_dataType;
AtomicOperation m_atomicOp;
};
AtomicOperationCaseInstance::AtomicOperationCaseInstance (Context& context,
const ShaderSpec& shaderSpec,
AtomicShaderType shaderType,
DataType dataType,
AtomicOperation atomicOp)
: TestInstance (context)
, m_shaderSpec (shaderSpec)
, m_shaderType (shaderType)
, m_dataType (dataType)
, m_atomicOp (atomicOp)
{
}
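// Creates the data buffer (and, for buffer references, an auxiliary uniform buffer holding its
// device address), sets up a single-binding descriptor set, runs the shader and checks the results.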
tcu::TestStatus AtomicOperationCaseInstance::iterate(void)
{
de::UniquePtr<BufferInterface> testBuffer (createTestBuffer(m_dataType, m_atomicOp));
tcu::TestLog& log = m_context.getTestContext().getLog();
const DeviceInterface& vkd = m_context.getDeviceInterface();
const VkDevice device = m_context.getDevice();
de::Random rnd (0x62a15e34);
const bool useRef = (m_shaderType.getMemoryType() == AtomicMemoryType::REFERENCE);
const VkDescriptorType descType = (useRef ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
const VkBufferUsageFlags usageFlags = (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | (useRef ? static_cast<VkBufferUsageFlags>(VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT) : 0u));
// The main buffer will hold test data. When using buffer references, the buffer's address will be indirectly passed as part of
// a uniform buffer. If not, it will be passed directly as a descriptor.
Buffer buffer (m_context, usageFlags, testBuffer->bufferSize(), useRef);
std::unique_ptr<Buffer> auxBuffer;
if (useRef)
{
// Pass the main buffer address inside a uniform buffer.
const VkBufferDeviceAddressInfo addressInfo =
{
VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, // VkStructureType sType;
nullptr, // const void* pNext;
buffer.getBuffer(), // VkBuffer buffer;
};
const auto address = vkd.getBufferDeviceAddress(device, &addressInfo);
auxBuffer.reset(new Buffer(m_context, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, sizeof(address), false));
deMemcpy(auxBuffer->getHostPtr(), &address, sizeof(address));
auxBuffer->flush();
}
testBuffer->setBuffer(buffer.getHostPtr());
testBuffer->fillWithTestData(rnd);
buffer.flush();
Move<VkDescriptorSetLayout> extraResourcesLayout;
Move<VkDescriptorPool> extraResourcesSetPool;
Move<VkDescriptorSet> extraResourcesSet;
const VkDescriptorSetLayoutBinding bindings[] =
{
{ 0u, descType, 1, VK_SHADER_STAGE_ALL, DE_NULL }
};
const VkDescriptorSetLayoutCreateInfo layoutInfo =
{
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
DE_NULL,
(VkDescriptorSetLayoutCreateFlags)0u,
DE_LENGTH_OF_ARRAY(bindings),
bindings
};
extraResourcesLayout = createDescriptorSetLayout(vkd, device, &layoutInfo);
const VkDescriptorPoolSize poolSizes[] =
{
{ descType, 1u }
};
const VkDescriptorPoolCreateInfo poolInfo =
{
VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
DE_NULL,
(VkDescriptorPoolCreateFlags)VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
1u, // maxSets
DE_LENGTH_OF_ARRAY(poolSizes),
poolSizes
};
extraResourcesSetPool = createDescriptorPool(vkd, device, &poolInfo);
const VkDescriptorSetAllocateInfo allocInfo =
{
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
DE_NULL,
*extraResourcesSetPool,
1u,
&extraResourcesLayout.get()
};
extraResourcesSet = allocateDescriptorSet(vkd, device, &allocInfo);
VkDescriptorBufferInfo bufferInfo;
bufferInfo.buffer = (useRef ? auxBuffer->getBuffer() : buffer.getBuffer());
bufferInfo.offset = 0u;
bufferInfo.range = VK_WHOLE_SIZE;
const VkWriteDescriptorSet descriptorWrite =
{
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
DE_NULL,
*extraResourcesSet,
0u, // dstBinding
0u, // dstArrayElement
1u,
descType,
(const VkDescriptorImageInfo*)DE_NULL,
&bufferInfo,
(const VkBufferView*)DE_NULL
};
vkd.updateDescriptorSets(device, 1u, &descriptorWrite, 0u, DE_NULL);
// Storage for output varying data.
std::vector<deUint32> outputs (NUM_ELEMENTS);
std::vector<void*> outputPtr (NUM_ELEMENTS);
for (size_t i = 0; i < NUM_ELEMENTS; i++)
{
outputs[i] = 0xcdcdcdcd;
outputPtr[i] = &outputs[i];
}
const int numWorkGroups = ((m_shaderType.getMemoryType() == AtomicMemoryType::SHARED) ? 1 : static_cast<int>(NUM_ELEMENTS));
UniquePtr<ShaderExecutor> executor (createExecutor(m_context, m_shaderType.getType(), m_shaderSpec, *extraResourcesLayout));
executor->execute(numWorkGroups, DE_NULL, &outputPtr[0], *extraResourcesSet);
buffer.invalidate();
tcu::ResultCollector resultCollector(log);
// Check the results of the atomic operation
testBuffer->checkResults(resultCollector);
return tcu::TestStatus(resultCollector.getResult(), resultCollector.getMessage());
}
class AtomicOperationCase : public TestCase
{
public:
AtomicOperationCase (tcu::TestContext& testCtx,
const char* name,
const char* description,
AtomicShaderType type,
DataType dataType,
AtomicOperation atomicOp);
virtual ~AtomicOperationCase (void);
virtual TestInstance* createInstance (Context& ctx) const;
virtual void checkSupport (Context& ctx) const;
virtual void initPrograms (vk::SourceCollections& programCollection) const
{
generateSources(m_shaderType.getType(), m_shaderSpec, programCollection);
}
private:
void createShaderSpec();
ShaderSpec m_shaderSpec;
const AtomicShaderType m_shaderType;
const DataType m_dataType;
const AtomicOperation m_atomicOp;
};
AtomicOperationCase::AtomicOperationCase (tcu::TestContext& testCtx,
const char* name,
const char* description,
AtomicShaderType shaderType,
DataType dataType,
AtomicOperation atomicOp)
: TestCase (testCtx, name, description)
, m_shaderType (shaderType)
, m_dataType (dataType)
, m_atomicOp (atomicOp)
{
createShaderSpec();
init();
}
AtomicOperationCase::~AtomicOperationCase (void)
{
}
TestInstance* AtomicOperationCase::createInstance (Context& ctx) const
{
return new AtomicOperationCaseInstance(ctx, m_shaderSpec, m_shaderType, m_dataType, m_atomicOp);
}
void AtomicOperationCase::checkSupport (Context& ctx) const
{
if ((m_dataType == DATA_TYPE_INT64) || (m_dataType == DATA_TYPE_UINT64))
{
ctx.requireDeviceFunctionality("VK_KHR_shader_atomic_int64");
const auto atomicInt64Features = ctx.getShaderAtomicInt64Features();
const bool isSharedMemory = (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED);
if (!isSharedMemory && atomicInt64Features.shaderBufferInt64Atomics == VK_FALSE)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicInt64: 64-bit integer atomic operations not supported for buffers");
}
if (isSharedMemory && atomicInt64Features.shaderSharedInt64Atomics == VK_FALSE)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicInt64: 64-bit integer atomic operations not supported for shared memory");
}
}
if (m_dataType == DATA_TYPE_FLOAT16)
{
ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float2");
if (m_atomicOp == ATOMIC_OP_ADD)
{
if (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED)
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat16AtomicAdd)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat16: 16-bit floating point shared add atomic operation not supported");
}
}
else
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat16AtomicAdd)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat16: 16-bit floating point buffer add atomic operation not supported");
}
}
}
if (m_atomicOp == ATOMIC_OP_MIN || m_atomicOp == ATOMIC_OP_MAX)
{
if (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED)
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat16AtomicMinMax)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat16: 16-bit floating point shared min/max atomic operation not supported");
}
}
else
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat16AtomicMinMax)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat16: 16-bit floating point buffer min/max atomic operation not supported");
}
}
}
if (m_atomicOp == ATOMIC_OP_EXCHANGE)
{
if (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED)
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat16Atomics)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat16: 16-bit floating point shared atomic operations not supported");
}
}
else
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat16Atomics)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat16: 16-bit floating point buffer atomic operations not supported");
}
}
}
}
if (m_dataType == DATA_TYPE_FLOAT32)
{
ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float");
if (m_atomicOp == ATOMIC_OP_ADD)
{
if (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED)
{
if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderSharedFloat32AtomicAdd)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat32: 32-bit floating point shared add atomic operation not supported");
}
}
else
{
if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderBufferFloat32AtomicAdd)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat32: 32-bit floating point buffer add atomic operation not supported");
}
}
}
if (m_atomicOp == ATOMIC_OP_MIN || m_atomicOp == ATOMIC_OP_MAX)
{
ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float2");
if (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED)
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat32AtomicMinMax)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat32: 32-bit floating point shared min/max atomic operation not supported");
}
}
else
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat32AtomicMinMax)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat32: 32-bit floating point buffer min/max atomic operation not supported");
}
}
}
if (m_atomicOp == ATOMIC_OP_EXCHANGE)
{
if (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED)
{
if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderSharedFloat32Atomics)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat32: 32-bit floating point shared atomic operations not supported");
}
}
else
{
if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderBufferFloat32Atomics)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat32: 32-bit floating point buffer atomic operations not supported");
}
}
}
}
if (m_dataType == DATA_TYPE_FLOAT64)
{
ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float");
if (m_atomicOp == ATOMIC_OP_ADD)
{
if (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED)
{
if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderSharedFloat64AtomicAdd)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat64: 64-bit floating point shared add atomic operation not supported");
}
}
else
{
if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderBufferFloat64AtomicAdd)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat64: 64-bit floating point buffer add atomic operation not supported");
}
}
}
if (m_atomicOp == ATOMIC_OP_MIN || m_atomicOp == ATOMIC_OP_MAX)
{
ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float2");
if (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED)
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat64AtomicMinMax)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat64: 64-bit floating point shared min/max atomic operation not supported");
}
}
else
{
if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat64AtomicMinMax)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat64: 64-bit floating point buffer min/max atomic operation not supported");
}
}
}
if (m_atomicOp == ATOMIC_OP_EXCHANGE)
{
if (m_shaderType.getMemoryType() == AtomicMemoryType::SHARED)
{
if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderSharedFloat64Atomics)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat64: 64-bit floating point shared atomic operations not supported");
}
}
else
{
if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderBufferFloat64Atomics)
{
TCU_THROW(NotSupportedError, "VkShaderAtomicFloat64: 64-bit floating point buffer atomic operations not supported");
}
}
}
}
if (m_shaderType.getMemoryType() == AtomicMemoryType::REFERENCE)
{
ctx.requireDeviceFunctionality("VK_KHR_buffer_device_address");
}
// Check stores and atomic operation support.
switch (m_shaderType.getType())
{
case glu::SHADERTYPE_VERTEX:
case glu::SHADERTYPE_TESSELLATION_CONTROL:
case glu::SHADERTYPE_TESSELLATION_EVALUATION:
case glu::SHADERTYPE_GEOMETRY:
if (!ctx.getDeviceFeatures().vertexPipelineStoresAndAtomics)
TCU_THROW(NotSupportedError, "Stores and atomic operations are not supported in Vertex, Tessellation, and Geometry shader.");
break;
case glu::SHADERTYPE_FRAGMENT:
if (!ctx.getDeviceFeatures().fragmentStoresAndAtomics)
TCU_THROW(NotSupportedError, "Stores and atomic operations are not supported in fragment shader.");
break;
case glu::SHADERTYPE_COMPUTE:
break;
default:
DE_FATAL("Unsupported shader type");
}
checkSupportShader(ctx, m_shaderType.getType());
}
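// Builds the GLSL declarations and shader body from string templates, specialized for the
// data type, atomic operation, memory type and shader stage under test.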
void AtomicOperationCase::createShaderSpec (void)
{
const AtomicMemoryType memoryType = m_shaderType.getMemoryType();
// Global declarations.
std::ostringstream shaderTemplateGlobalStream;
// Structure in use for atomic operations.
shaderTemplateGlobalStream
<< "${EXTENSIONS}\n"
<< "\n"
<< "struct AtomicStruct\n"
<< "{\n"
<< " ${DATATYPE} inoutValues[${N}/2];\n"
<< " ${DATATYPE} inputValues[${N}];\n"
<< " ${DATATYPE} compareValues[${N}];\n"
<< " ${DATATYPE} outputValues[${N}];\n"
<< " int invocationHitCount[${N}];\n"
<< " int index;\n"
<< "};\n"
<< "\n"
;
// The name dance and declarations below will make sure the structure that will be used with atomic operations can be accessed
// as "buf.data", which is the name used in the atomic operation statements.
//
// * When using a buffer directly, RESULT_BUFFER_NAME will be "buf" and the inner struct will be "data".
// * When using a workgroup-shared global variable, the "data" struct will be nested in an auxiliary "buf" struct.
// * When using buffer references, the uniform buffer reference will be called "buf" and its contents "data".
//
if (memoryType != AtomicMemoryType::REFERENCE)
{
shaderTemplateGlobalStream
<< "layout (set = ${SETIDX}, binding = 0) buffer AtomicBuffer {\n"
<< " AtomicStruct data;\n"
<< "} ${RESULT_BUFFER_NAME};\n"
<< "\n"
;
// When using global shared memory in the compute variant, invocations will use a shared global structure instead of a
// descriptor set as the sources and results of each tested operation.
if (memoryType == AtomicMemoryType::SHARED)
{
shaderTemplateGlobalStream
<< "shared struct { AtomicStruct data; } buf;\n"
<< "\n"
;
}
}
else
{
shaderTemplateGlobalStream
<< "layout (buffer_reference) buffer AtomicBuffer {\n"
<< " AtomicStruct data;\n"
<< "};\n"
<< "\n"
<< "layout (set = ${SETIDX}, binding = 0) uniform References {\n"
<< " AtomicBuffer buf;\n"
<< "};\n"
<< "\n"
;
}
const auto shaderTemplateGlobalString = shaderTemplateGlobalStream.str();
const tcu::StringTemplate shaderTemplateGlobal (shaderTemplateGlobalString);
// Shader body for the non-vertex case.
std::ostringstream nonVertexShaderTemplateStream;
if (memoryType == AtomicMemoryType::SHARED)
{
// Invocation zero will initialize the shared structure from the descriptor set.
nonVertexShaderTemplateStream
<< "if (gl_LocalInvocationIndex == 0u)\n"
<< "{\n"
<< " buf.data = ${RESULT_BUFFER_NAME}.data;\n"
<< "}\n"
<< "barrier();\n"
;
}
if (m_shaderType.getType() == glu::SHADERTYPE_FRAGMENT)
{
nonVertexShaderTemplateStream
<< "if (!gl_HelperInvocation) {\n"
<< " int idx = atomicAdd(buf.data.index, 1);\n"
<< " buf.data.outputValues[idx] = ${ATOMICOP}(buf.data.inoutValues[idx % (${N}/2)], ${COMPARE_ARG}buf.data.inputValues[idx]);\n"
<< "}\n"
;
}
else
{
nonVertexShaderTemplateStream
<< "if (atomicAdd(buf.data.invocationHitCount[0], 1) < ${N})\n"
<< "{\n"
<< " int idx = atomicAdd(buf.data.index, 1);\n"
<< " buf.data.outputValues[idx] = ${ATOMICOP}(buf.data.inoutValues[idx % (${N}/2)], ${COMPARE_ARG}buf.data.inputValues[idx]);\n"
<< "}\n"
;
}
if (memoryType == AtomicMemoryType::SHARED)
{
// Invocation zero will copy results back to the descriptor set.
nonVertexShaderTemplateStream
<< "barrier();\n"
<< "if (gl_LocalInvocationIndex == 0u)\n"
<< "{\n"
<< " ${RESULT_BUFFER_NAME}.data = buf.data;\n"
<< "}\n"
;
}
const auto nonVertexShaderTemplateStreamStr = nonVertexShaderTemplateStream.str();
const tcu::StringTemplate nonVertexShaderTemplateSrc (nonVertexShaderTemplateStreamStr);
// Shader body for the vertex case.
const tcu::StringTemplate vertexShaderTemplateSrc(
"int idx = gl_VertexIndex;\n"
"if (atomicAdd(buf.data.invocationHitCount[idx], 1) == 0)\n"
"{\n"
" buf.data.outputValues[idx] = ${ATOMICOP}(buf.data.inoutValues[idx % (${N}/2)], ${COMPARE_ARG}buf.data.inputValues[idx]);\n"
"}\n");
// Extensions.
std::ostringstream extensions;
if ((m_dataType == DATA_TYPE_INT64) || (m_dataType == DATA_TYPE_UINT64))
{
extensions
<< "#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable\n"
<< "#extension GL_EXT_shader_atomic_int64 : enable\n"
;
}
else if ((m_dataType == DATA_TYPE_FLOAT16) || (m_dataType == DATA_TYPE_FLOAT32) || (m_dataType == DATA_TYPE_FLOAT64))
{
extensions
<< "#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable\n"
<< "#extension GL_EXT_shader_atomic_float : enable\n"
<< "#extension GL_EXT_shader_atomic_float2 : enable\n"
<< "#extension GL_KHR_memory_scope_semantics : enable\n"
;
}
if (memoryType == AtomicMemoryType::REFERENCE)
{
extensions << "#extension GL_EXT_buffer_reference : require\n";
}
// Specializations.
std::map<std::string, std::string> specializations;
specializations["EXTENSIONS"] = extensions.str();
specializations["DATATYPE"] = dataType2Str(m_dataType);
specializations["ATOMICOP"] = atomicOp2Str(m_atomicOp);
specializations["SETIDX"] = de::toString((int)EXTRA_RESOURCES_DESCRIPTOR_SET_INDEX);
specializations["N"] = de::toString((int)NUM_ELEMENTS);
specializations["COMPARE_ARG"] = ((m_atomicOp == ATOMIC_OP_COMP_SWAP) ? "buf.data.compareValues[idx], " : "");
specializations["RESULT_BUFFER_NAME"] = ((memoryType == AtomicMemoryType::SHARED) ? "result" : "buf");
// Shader spec.
m_shaderSpec.outputs.push_back(Symbol("outData", glu::VarType(glu::TYPE_UINT, glu::PRECISION_HIGHP)));
m_shaderSpec.glslVersion = glu::GLSL_VERSION_450;
m_shaderSpec.globalDeclarations = shaderTemplateGlobal.specialize(specializations);
m_shaderSpec.source = ((m_shaderType.getType() == glu::SHADERTYPE_VERTEX)
? vertexShaderTemplateSrc.specialize(specializations)
: nonVertexShaderTemplateSrc.specialize(specializations));
if (memoryType == AtomicMemoryType::SHARED)
{
// When using global shared memory, use a single workgroup and an appropriate number of local invocations.
m_shaderSpec.localSizeX = static_cast<int>(NUM_ELEMENTS);
}
}
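// Creates one test case per combination of atomic operation, data type, shader stage and
// memory type, skipping invalid combinations (e.g. bitwise operations on floating-point
// types, or shared memory outside compute shaders).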
void addAtomicOperationTests (tcu::TestCaseGroup* atomicOperationTestsGroup)
{
tcu::TestContext& testCtx = atomicOperationTestsGroup->getTestContext();
static const struct
{
glu::ShaderType type;
const char* name;
} shaderTypes[] =
{
{ glu::SHADERTYPE_VERTEX, "vertex" },
{ glu::SHADERTYPE_FRAGMENT, "fragment" },
{ glu::SHADERTYPE_GEOMETRY, "geometry" },
{ glu::SHADERTYPE_TESSELLATION_CONTROL, "tess_ctrl" },
{ glu::SHADERTYPE_TESSELLATION_EVALUATION, "tess_eval" },
{ glu::SHADERTYPE_COMPUTE, "compute" },
};
static const struct
{
AtomicMemoryType type;
const char* suffix;
} kMemoryTypes[] =
{
{ AtomicMemoryType::BUFFER, "" },
{ AtomicMemoryType::SHARED, "_shared" },
{ AtomicMemoryType::REFERENCE, "_reference" },
};
static const struct
{
DataType dataType;
const char* name;
const char* description;
} dataSign[] =
{
{ DATA_TYPE_FLOAT16,"float16", "Tests using 16-bit float data" },
{ DATA_TYPE_INT32, "signed", "Tests using signed data (int)" },
{ DATA_TYPE_UINT32, "unsigned", "Tests using unsigned data (uint)" },
{ DATA_TYPE_FLOAT32,"float32", "Tests using 32-bit float data" },
{ DATA_TYPE_INT64, "signed64bit", "Tests using 64 bit signed data (int64)" },
{ DATA_TYPE_UINT64, "unsigned64bit", "Tests using 64 bit unsigned data (uint64)" },
{ DATA_TYPE_FLOAT64,"float64", "Tests using 64-bit float data" }
};
static const struct
{
AtomicOperation value;
const char* name;
} atomicOp[] =
{
{ ATOMIC_OP_EXCHANGE, "exchange" },
{ ATOMIC_OP_COMP_SWAP, "comp_swap" },
{ ATOMIC_OP_ADD, "add" },
{ ATOMIC_OP_MIN, "min" },
{ ATOMIC_OP_MAX, "max" },
{ ATOMIC_OP_AND, "and" },
{ ATOMIC_OP_OR, "or" },
{ ATOMIC_OP_XOR, "xor" }
};
for (int opNdx = 0; opNdx < DE_LENGTH_OF_ARRAY(atomicOp); opNdx++)
{
for (int signNdx = 0; signNdx < DE_LENGTH_OF_ARRAY(dataSign); signNdx++)
{
for (int shaderTypeNdx = 0; shaderTypeNdx < DE_LENGTH_OF_ARRAY(shaderTypes); shaderTypeNdx++)
{
// Only ADD, MIN, MAX and EXCHANGE are supported on floating-point types
if (dataSign[signNdx].dataType == DATA_TYPE_FLOAT16 || dataSign[signNdx].dataType == DATA_TYPE_FLOAT32 || dataSign[signNdx].dataType == DATA_TYPE_FLOAT64)
{
if (atomicOp[opNdx].value != ATOMIC_OP_ADD &&
atomicOp[opNdx].value != ATOMIC_OP_MIN &&
atomicOp[opNdx].value != ATOMIC_OP_MAX &&
atomicOp[opNdx].value != ATOMIC_OP_EXCHANGE)
{
continue;
}
}
for (int memoryTypeNdx = 0; memoryTypeNdx < DE_LENGTH_OF_ARRAY(kMemoryTypes); ++memoryTypeNdx)
{
// Shared memory only available in compute shaders.
if (kMemoryTypes[memoryTypeNdx].type == AtomicMemoryType::SHARED && shaderTypes[shaderTypeNdx].type != glu::SHADERTYPE_COMPUTE)
continue;
const std::string description = std::string("Tests atomic operation ") + atomicOp2Str(atomicOp[opNdx].value) + std::string(".");
const std::string name = std::string(atomicOp[opNdx].name) + "_" + std::string(dataSign[signNdx].name) + "_" + std::string(shaderTypes[shaderTypeNdx].name) + kMemoryTypes[memoryTypeNdx].suffix;
atomicOperationTestsGroup->addChild(new AtomicOperationCase(testCtx, name.c_str(), description.c_str(), AtomicShaderType(shaderTypes[shaderTypeNdx].type, kMemoryTypes[memoryTypeNdx].type), dataSign[signNdx].dataType, atomicOp[opNdx].value));
}
}
}
}
}
} // anonymous
tcu::TestCaseGroup* createAtomicOperationTests (tcu::TestContext& testCtx)
{
return createTestGroup(testCtx, "atomic_operations", "Atomic Operation Tests", addAtomicOperationTests);
}
} // shaderexecutor
} // vkt