Merge pull request #1729 from KhronosGroup/fix-1726

MSL: Consider that function/private variables can be block-like.
diff --git a/reference/shaders-msl-no-opt/asm/frag/copy-memory-block-like-thread-local.asm.frag b/reference/shaders-msl-no-opt/asm/frag/copy-memory-block-like-thread-local.asm.frag
new file mode 100644
index 0000000..5af18eb
--- /dev/null
+++ b/reference/shaders-msl-no-opt/asm/frag/copy-memory-block-like-thread-local.asm.frag
@@ -0,0 +1,170 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+#include <metal_stdlib>
+#include <simd/simd.h>
+using namespace metal;
+template<typename T, size_t Num>
+struct spvUnsafeArray
+    T elements[Num ? Num : 1];
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+struct _3
+    float _m0[4];
+template<typename T, uint A>
+inline void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromDeviceToDevice1(device T (&dst)[A], device const T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromConstantToDevice1(device T (&dst)[A], constant T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromStackToDevice1(device T (&dst)[A], thread const T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromThreadGroupToDevice1(device T (&dst)[A], threadgroup const T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromDeviceToStack1(thread T (&dst)[A], device const T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+template<typename T, uint A>
+inline void spvArrayCopyFromDeviceToThreadGroup1(threadgroup T (&dst)[A], device const T (&src)[A])
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+fragment void main0()
+    spvUnsafeArray<float, 4> _20;
+    _20[0u] = 0.0;
+    _20[1u] = 0.0;
+    _20[2u] = 0.0;
+    _20[3u] = 0.0;
+    _3 _19;
+    spvArrayCopyFromStackToStack1(_19._m0, _20.elements);
diff --git a/shaders-msl-no-opt/asm/frag/copy-memory-block-like-thread-local.asm.frag b/shaders-msl-no-opt/asm/frag/copy-memory-block-like-thread-local.asm.frag
new file mode 100644
index 0000000..ebab7fd
--- /dev/null
+++ b/shaders-msl-no-opt/asm/frag/copy-memory-block-like-thread-local.asm.frag
@@ -0,0 +1,50 @@
+; Version: 1.3
+; Generator: Google rspirv; 0
+; Bound: 43
+; Schema: 0
+               OpCapability ImageQuery
+               OpCapability Int8
+               OpCapability RuntimeDescriptorArray
+               OpCapability StorageImageWriteWithoutFormat
+               OpCapability Shader
+               OpCapability VulkanMemoryModel
+               OpExtension "SPV_EXT_descriptor_indexing"
+               OpExtension "SPV_KHR_vulkan_memory_model"
+               OpMemoryModel Logical Vulkan
+               OpEntryPoint Fragment %1 "main"
+               OpExecutionMode %1 OriginUpperLeft
+               OpDecorate %2 ArrayStride 4
+               OpMemberDecorate %3 0 Offset 0
+          %4 = OpTypeInt 32 0
+          %5 = OpTypeFloat 32
+          %6 = OpTypePointer Function %5
+          %7 = OpTypeVoid
+          %8 = OpTypeFunction %7
+          %9 = OpConstant %4 0
+         %10 = OpConstant %4 1
+         %11 = OpConstant %4 2
+         %12 = OpConstant %4 4
+         %13 = OpConstant %4 3
+         %14 = OpConstant %5 0
+          %2 = OpTypeArray %5 %12
+         %15 = OpTypePointer Function %2
+         %16 = OpTypeFunction %7 %15
+          %3 = OpTypeStruct %2
+         %17 = OpTypePointer Function %3
+          %1 = OpFunction %7 None %8
+         %31 = OpLabel
+         %33 = OpVariable %17 Function
+         %34 = OpVariable %15 Function
+         %39 = OpAccessChain %6 %34 %9
+               OpStore %39 %14
+         %40 = OpAccessChain %6 %34 %10
+               OpStore %40 %14
+         %41 = OpAccessChain %6 %34 %11
+               OpStore %41 %14
+         %42 = OpAccessChain %6 %34 %13
+               OpStore %42 %14
+         %37 = OpAccessChain %15 %33 %9
+               OpCopyMemory %37 %34
+               OpReturn
+               OpFunctionEnd
diff --git a/spirv_msl.cpp b/spirv_msl.cpp
index 1e4aa4b..56794c3 100644
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@@ -8511,13 +8511,27 @@
 	// Special considerations for stage IO variables.
 	// If the variable is actually backed by non-user visible device storage, we use array templates for those.
+	//
+	// Another special consideration is given to thread-local variables which happen to have Offset decorations
+	// applied to them. Block-like types do not use array templates, so we need to force the POD path if we detect
+	// these scenarios. This check isn't perfect, since it is technically possible to mix and match these things,
+	// and a fully correct solution might have to track array template state through access chains as well,
+	// but for all reasonable use cases, this should suffice.
+	// This special case should also only apply to the Function/Private storage classes.
+	// We should not check the backing variable for temporaries.
 	auto *lhs_var = maybe_get_backing_variable(lhs_id);
 	if (lhs_var && lhs_storage == StorageClassStorageBuffer && storage_class_array_is_thread(lhs_var->storage))
 		lhs_is_array_template = true;
+	else if (lhs_var && (lhs_storage == StorageClassFunction || lhs_storage == StorageClassPrivate) &&
+	         type_is_block_like(get<SPIRType>(lhs_var->basetype)))
+		lhs_is_array_template = false;
 	auto *rhs_var = maybe_get_backing_variable(rhs_id);
 	if (rhs_var && rhs_storage == StorageClassStorageBuffer && storage_class_array_is_thread(rhs_var->storage))
 		rhs_is_array_template = true;
+	else if (rhs_var && (rhs_storage == StorageClassFunction || rhs_storage == StorageClassPrivate) &&
+	         type_is_block_like(get<SPIRType>(rhs_var->basetype)))
+		rhs_is_array_template = false;
 	// If threadgroup storage qualifiers are *not* used:
 	// Avoid spvCopy* wrapper functions; Otherwise, spvUnsafeArray<> template cannot be used with that storage qualifier.