Merge pull request #821 from cdavis5e/pass-sampled-images

MSL: Fix passing a sampled image to a function.
diff --git a/main.cpp b/main.cpp
index 4e4605b..7899109 100644
--- a/main.cpp
+++ b/main.cpp
@@ -492,6 +492,7 @@
 	bool support_nonzero_baseinstance = true;
 	bool msl_swizzle_texture_samples = false;
 	bool msl_ios = false;
+	bool msl_pad_fragment_output = false;
 	vector<PLSArg> pls_in;
 	vector<PLSArg> pls_out;
 	vector<Remap> remaps;
@@ -546,6 +547,7 @@
 	                "\t[--msl-version <MMmmpp>]\n"
 	                "\t[--msl-swizzle-texture-samples]\n"
 	                "\t[--msl-ios]\n"
+	                "\t[--msl-pad-fragment-output]\n"
 	                "\t[--hlsl]\n"
 	                "\t[--reflect]\n"
 	                "\t[--shader-model]\n"
@@ -714,6 +716,7 @@
 	cbs.add("--no-420pack-extension", [&args](CLIParser &) { args.use_420pack_extension = false; });
 	cbs.add("--msl-swizzle-texture-samples", [&args](CLIParser &) { args.msl_swizzle_texture_samples = true; });
 	cbs.add("--msl-ios", [&args](CLIParser &) { args.msl_ios = true; });
+	cbs.add("--msl-pad-fragment-output", [&args](CLIParser &) { args.msl_pad_fragment_output = true; });
 	cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); });
 	cbs.add("--rename-entry-point", [&args](CLIParser &parser) {
 		auto old_name = parser.next_string();
@@ -843,6 +846,7 @@
 		msl_opts.swizzle_texture_samples = args.msl_swizzle_texture_samples;
 		if (args.msl_ios)
 			msl_opts.platform = CompilerMSL::Options::iOS;
+		msl_opts.pad_fragment_output_components = args.msl_pad_fragment_output;
 		msl_comp->set_msl_options(msl_opts);
 	}
 	else if (args.hlsl)
diff --git a/reference/opt/shaders-msl/frag/fragment-component-padding.pad-fragment.frag b/reference/opt/shaders-msl/frag/fragment-component-padding.pad-fragment.frag
new file mode 100644
index 0000000..53aafa5
--- /dev/null
+++ b/reference/opt/shaders-msl/frag/fragment-component-padding.pad-fragment.frag
@@ -0,0 +1,35 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 FragColors_0 [[color(0)]];
+    float4 FragColors_1 [[color(1)]];
+    float4 FragColor2 [[color(2)]];
+    float4 FragColor3 [[color(3)]];
+};
+
+struct main0_in
+{
+    float3 vColor [[user(locn0)]];
+};
+
+fragment main0_out main0(main0_in in [[stage_in]])
+{
+    main0_out out = {};
+    float FragColors[2] = {};
+    float2 FragColor2 = {};
+    float3 FragColor3 = {};
+    FragColors[0] = in.vColor.x;
+    FragColors[1] = in.vColor.y;
+    FragColor2 = in.vColor.xz;
+    FragColor3 = in.vColor.zzz;
+    out.FragColors_0 = float4(FragColors[0]);
+    out.FragColors_1 = float4(FragColors[1]);
+    out.FragColor2 = FragColor2.xyyy;
+    out.FragColor3 = FragColor3.xyzz;
+    return out;
+}
+
diff --git a/reference/shaders-hlsl-no-opt/vert/pass-array-by-value.vert b/reference/shaders-hlsl-no-opt/vert/pass-array-by-value.vert
new file mode 100644
index 0000000..20afdb5
--- /dev/null
+++ b/reference/shaders-hlsl-no-opt/vert/pass-array-by-value.vert
@@ -0,0 +1,48 @@
+static const float4 _68[4] = { 0.0f.xxxx, 1.0f.xxxx, 2.0f.xxxx, 3.0f.xxxx };
+
+static float4 gl_Position;
+static int Index1;
+static int Index2;
+
+struct SPIRV_Cross_Input
+{
+    int Index1 : TEXCOORD0;
+    int Index2 : TEXCOORD1;
+};
+
+struct SPIRV_Cross_Output
+{
+    float4 gl_Position : SV_Position;
+};
+
+float4 consume_constant_arrays2(float4 positions[4], float4 positions2[4])
+{
+    float4 indexable[4] = positions;
+    float4 indexable_1[4] = positions2;
+    return indexable[Index1] + indexable_1[Index2];
+}
+
+float4 consume_constant_arrays(float4 positions[4], float4 positions2[4])
+{
+    return consume_constant_arrays2(positions, positions2);
+}
+
+void vert_main()
+{
+    float4 LUT2[4];
+    LUT2[0] = 10.0f.xxxx;
+    LUT2[1] = 11.0f.xxxx;
+    LUT2[2] = 12.0f.xxxx;
+    LUT2[3] = 13.0f.xxxx;
+    gl_Position = consume_constant_arrays(_68, LUT2);
+}
+
+SPIRV_Cross_Output main(SPIRV_Cross_Input stage_input)
+{
+    Index1 = stage_input.Index1;
+    Index2 = stage_input.Index2;
+    vert_main();
+    SPIRV_Cross_Output stage_output;
+    stage_output.gl_Position = gl_Position;
+    return stage_output;
+}
diff --git a/reference/shaders-msl-no-opt/vert/pass-array-by-value.vert b/reference/shaders-msl-no-opt/vert/pass-array-by-value.vert
new file mode 100644
index 0000000..ab56313
--- /dev/null
+++ b/reference/shaders-msl-no-opt/vert/pass-array-by-value.vert
@@ -0,0 +1,60 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+constant float4 _68[4] = { float4(0.0), float4(1.0), float4(2.0), float4(3.0) };
+
+struct main0_out
+{
+    float4 gl_Position [[position]];
+};
+
+struct main0_in
+{
+    int Index1 [[attribute(0)]];
+    int Index2 [[attribute(1)]];
+};
+
+// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
+template<typename T, uint N>
+void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+{
+    for (uint i = 0; i < N; dst[i] = src[i], i++);
+}
+
+template<typename T, uint N>
+void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+{
+    for (uint i = 0; i < N; dst[i] = src[i], i++);
+}
+
+float4 consume_constant_arrays2(thread const float4 (&positions)[4], thread const float4 (&positions2)[4], thread int& Index1, thread int& Index2)
+{
+    float4 indexable[4];
+    spvArrayCopyFromStack1(indexable, positions);
+    float4 indexable_1[4];
+    spvArrayCopyFromStack1(indexable_1, positions2);
+    return indexable[Index1] + indexable_1[Index2];
+}
+
+float4 consume_constant_arrays(thread const float4 (&positions)[4], thread const float4 (&positions2)[4], thread int& Index1, thread int& Index2)
+{
+    return consume_constant_arrays2(positions, positions2, Index1, Index2);
+}
+
+vertex main0_out main0(main0_in in [[stage_in]])
+{
+    float4 _68_array_copy[4] = { float4(0.0), float4(1.0), float4(2.0), float4(3.0) };
+    main0_out out = {};
+    float4 LUT2[4];
+    LUT2[0] = float4(10.0);
+    LUT2[1] = float4(11.0);
+    LUT2[2] = float4(12.0);
+    LUT2[3] = float4(13.0);
+    out.gl_Position = consume_constant_arrays(_68_array_copy, LUT2, in.Index1, in.Index2);
+    return out;
+}
+
diff --git a/reference/shaders-msl/frag/fragment-component-padding.pad-fragment.frag b/reference/shaders-msl/frag/fragment-component-padding.pad-fragment.frag
new file mode 100644
index 0000000..2d339c4
--- /dev/null
+++ b/reference/shaders-msl/frag/fragment-component-padding.pad-fragment.frag
@@ -0,0 +1,42 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 FragColors_0 [[color(0)]];
+    float4 FragColors_1 [[color(1)]];
+    float4 FragColor2 [[color(2)]];
+    float4 FragColor3 [[color(3)]];
+};
+
+struct main0_in
+{
+    float3 vColor [[user(locn0)]];
+};
+
+void set_globals(thread float (&FragColors)[2], thread float3& vColor, thread float2& FragColor2, thread float3& FragColor3)
+{
+    FragColors[0] = vColor.x;
+    FragColors[1] = vColor.y;
+    FragColor2 = vColor.xz;
+    FragColor3 = vColor.zzz;
+}
+
+fragment main0_out main0(main0_in in [[stage_in]])
+{
+    main0_out out = {};
+    float FragColors[2] = {};
+    float2 FragColor2 = {};
+    float3 FragColor3 = {};
+    set_globals(FragColors, in.vColor, FragColor2, FragColor3);
+    out.FragColors_0 = float4(FragColors[0]);
+    out.FragColors_1 = float4(FragColors[1]);
+    out.FragColor2 = FragColor2.xyyy;
+    out.FragColor3 = FragColor3.xyzz;
+    return out;
+}
+
diff --git a/reference/shaders-no-opt/vert/pass-array-by-value.vert b/reference/shaders-no-opt/vert/pass-array-by-value.vert
new file mode 100644
index 0000000..45d4210
--- /dev/null
+++ b/reference/shaders-no-opt/vert/pass-array-by-value.vert
@@ -0,0 +1,27 @@
+#version 310 es
+
+layout(location = 0) in int Index1;
+layout(location = 1) in int Index2;
+
+vec4 consume_constant_arrays2(vec4 positions[4], vec4 positions2[4])
+{
+    vec4 indexable[4] = positions;
+    vec4 indexable_1[4] = positions2;
+    return indexable[Index1] + indexable_1[Index2];
+}
+
+vec4 consume_constant_arrays(vec4 positions[4], vec4 positions2[4])
+{
+    return consume_constant_arrays2(positions, positions2);
+}
+
+void main()
+{
+    vec4 LUT2[4];
+    LUT2[0] = vec4(10.0);
+    LUT2[1] = vec4(11.0);
+    LUT2[2] = vec4(12.0);
+    LUT2[3] = vec4(13.0);
+    gl_Position = consume_constant_arrays(vec4[](vec4(0.0), vec4(1.0), vec4(2.0), vec4(3.0)), LUT2);
+}
+
diff --git a/shaders-hlsl-no-opt/vert/pass-array-by-value.vert b/shaders-hlsl-no-opt/vert/pass-array-by-value.vert
new file mode 100644
index 0000000..2c142a7
--- /dev/null
+++ b/shaders-hlsl-no-opt/vert/pass-array-by-value.vert
@@ -0,0 +1,26 @@
+#version 310 es
+
+layout(location = 0) in int Index1;
+layout(location = 1) in int Index2;
+
+vec4 consume_constant_arrays2(const vec4 positions[4], const vec4 positions2[4])
+{
+	return positions[Index1] + positions2[Index2];
+}
+
+vec4 consume_constant_arrays(const vec4 positions[4], const vec4 positions2[4])
+{
+	return consume_constant_arrays2(positions, positions2);
+}
+
+const vec4 LUT1[] = vec4[](vec4(0.0), vec4(1.0), vec4(2.0), vec4(3.0));
+
+void main()
+{
+	vec4 LUT2[4];
+	LUT2[0] = vec4(10.0);
+	LUT2[1] = vec4(11.0);
+	LUT2[2] = vec4(12.0);
+	LUT2[3] = vec4(13.0);
+	gl_Position = consume_constant_arrays(LUT1, LUT2);
+}
diff --git a/shaders-msl-no-opt/vert/pass-array-by-value.vert b/shaders-msl-no-opt/vert/pass-array-by-value.vert
new file mode 100644
index 0000000..2c142a7
--- /dev/null
+++ b/shaders-msl-no-opt/vert/pass-array-by-value.vert
@@ -0,0 +1,26 @@
+#version 310 es
+
+layout(location = 0) in int Index1;
+layout(location = 1) in int Index2;
+
+vec4 consume_constant_arrays2(const vec4 positions[4], const vec4 positions2[4])
+{
+	return positions[Index1] + positions2[Index2];
+}
+
+vec4 consume_constant_arrays(const vec4 positions[4], const vec4 positions2[4])
+{
+	return consume_constant_arrays2(positions, positions2);
+}
+
+const vec4 LUT1[] = vec4[](vec4(0.0), vec4(1.0), vec4(2.0), vec4(3.0));
+
+void main()
+{
+	vec4 LUT2[4];
+	LUT2[0] = vec4(10.0);
+	LUT2[1] = vec4(11.0);
+	LUT2[2] = vec4(12.0);
+	LUT2[3] = vec4(13.0);
+	gl_Position = consume_constant_arrays(LUT1, LUT2);
+}
diff --git a/shaders-msl/frag/fragment-component-padding.pad-fragment.frag b/shaders-msl/frag/fragment-component-padding.pad-fragment.frag
new file mode 100644
index 0000000..240c59b
--- /dev/null
+++ b/shaders-msl/frag/fragment-component-padding.pad-fragment.frag
@@ -0,0 +1,18 @@
+#version 450
+layout(location = 0) out float FragColors[2];
+layout(location = 2) out vec2 FragColor2;
+layout(location = 3) out vec3 FragColor3;
+layout(location = 0) in vec3 vColor;
+
+void set_globals()
+{
+	FragColors[0] = vColor.x;
+	FragColors[1] = vColor.y;
+	FragColor2 = vColor.xz;
+	FragColor3 = vColor.zzz;
+}
+
+void main()
+{
+	set_globals();
+}
diff --git a/shaders-no-opt/vert/pass-array-by-value.vert b/shaders-no-opt/vert/pass-array-by-value.vert
new file mode 100644
index 0000000..2c142a7
--- /dev/null
+++ b/shaders-no-opt/vert/pass-array-by-value.vert
@@ -0,0 +1,26 @@
+#version 310 es
+
+layout(location = 0) in int Index1;
+layout(location = 1) in int Index2;
+
+vec4 consume_constant_arrays2(const vec4 positions[4], const vec4 positions2[4])
+{
+	return positions[Index1] + positions2[Index2];
+}
+
+vec4 consume_constant_arrays(const vec4 positions[4], const vec4 positions2[4])
+{
+	return consume_constant_arrays2(positions, positions2);
+}
+
+const vec4 LUT1[] = vec4[](vec4(0.0), vec4(1.0), vec4(2.0), vec4(3.0));
+
+void main()
+{
+	vec4 LUT2[4];
+	LUT2[0] = vec4(10.0);
+	LUT2[1] = vec4(11.0);
+	LUT2[2] = vec4(12.0);
+	LUT2[3] = vec4(13.0);
+	gl_Position = consume_constant_arrays(LUT1, LUT2);
+}
diff --git a/spirv_common.hpp b/spirv_common.hpp
index c106008..57820d0 100644
--- a/spirv_common.hpp
+++ b/spirv_common.hpp
@@ -812,6 +812,11 @@
 	// Need to defer this, because they might rely on things which change during compilation.
 	std::vector<std::function<void()>> fixup_hooks_in;
 
+	// On function entry, make sure to copy a constant array into thread addr space to work around
+	// the case where we are passing a constant array by value to a function on backends which do not
+	// consider arrays to be value types.
+	std::vector<uint32_t> constant_arrays_needed_on_stack;
+
 	bool active = false;
 	bool flush_undeclared = true;
 	bool do_combined_parameters = true;
diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp
index d9d04e1..1a3f08c 100644
--- a/spirv_glsl.cpp
+++ b/spirv_glsl.cpp
@@ -9605,6 +9605,14 @@
 	current_function = &func;
 	auto &entry_block = get<SPIRBlock>(func.entry_block);
 
+	sort(begin(func.constant_arrays_needed_on_stack), end(func.constant_arrays_needed_on_stack));
+	for (auto &array : func.constant_arrays_needed_on_stack)
+	{
+		auto &c = get<SPIRConstant>(array);
+		auto &type = get<SPIRType>(c.constant_type);
+		statement(variable_decl(type, join("_", array, "_array_copy")), " = ", constant_expression(c), ";");
+	}
+
 	for (auto &v : func.local_variables)
 	{
 		auto &var = get<SPIRVariable>(v);
diff --git a/spirv_msl.cpp b/spirv_msl.cpp
index 0041491..4f81ae3 100644
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@@ -82,6 +82,11 @@
 			resource_bindings.push_back(&p_res_bindings[i]);
 }
 
+void CompilerMSL::set_fragment_output_components(uint32_t location, uint32_t components)
+{
+	fragment_output_components[location] = components;
+}
+
 void CompilerMSL::build_implicit_builtins()
 {
 	bool need_sample_pos = active_input_builtins.get(BuiltInSamplePosition);
@@ -779,6 +784,27 @@
 		p_va->used_by_shader = true;
 }
 
+uint32_t CompilerMSL::get_target_components_for_fragment_location(uint32_t location) const
+{
+	auto itr = fragment_output_components.find(location);
+	if (itr == end(fragment_output_components))
+		return 4;
+	else
+		return itr->second;
+}
+
+uint32_t CompilerMSL::build_extended_vector_type(uint32_t type_id, uint32_t components)
+{
+	uint32_t new_type_id = ir.increase_bound_by(1);
+	auto &type = set<SPIRType>(new_type_id, get<SPIRType>(type_id));
+	type.vecsize = components;
+	type.self = new_type_id;
+	type.parent_type = 0;
+	type.pointer = false;
+
+	return new_type_id;
+}
+
 void CompilerMSL::add_plain_variable_to_interface_block(StorageClass storage, const string &ib_var_ref,
                                                         SPIRType &ib_type, SPIRVariable &var)
 {
@@ -793,6 +819,26 @@
 	uint32_t ib_mbr_idx = uint32_t(ib_type.member_types.size());
 	uint32_t type_id = ensure_correct_builtin_type(var.basetype, builtin);
 	var.basetype = type_id;
+
+	auto &type = get<SPIRType>(type_id);
+	uint32_t target_components = 0;
+	uint32_t type_components = type.vecsize;
+	bool padded_output = false;
+
+	// Check if we need to pad fragment output to match a certain number of components.
+	if (get_decoration_bitset(var.self).get(DecorationLocation) && msl_options.pad_fragment_output_components &&
+	    get_entry_point().model == ExecutionModelFragment && storage == StorageClassOutput)
+	{
+		uint32_t locn = get_decoration(var.self, DecorationLocation);
+		target_components = get_target_components_for_fragment_location(locn);
+		if (type_components < target_components)
+		{
+			// Make a new type here.
+			type_id = build_extended_vector_type(type_id, target_components);
+			padded_output = true;
+		}
+	}
+
 	ib_type.member_types.push_back(get_pointee_type_id(type_id));
 
 	// Give the member a name
@@ -801,7 +847,20 @@
 
 	// Update the original variable reference to include the structure reference
 	string qual_var_name = ib_var_ref + "." + mbr_name;
-	ir.meta[var.self].decoration.qualified_alias = qual_var_name;
+
+	if (padded_output)
+	{
+		auto &entry_func = get<SPIRFunction>(ir.default_entry_point);
+		entry_func.add_local_variable(var.self);
+		vars_needing_early_declaration.push_back(var.self);
+
+		entry_func.fixup_hooks_out.push_back([=, &var]() {
+			SPIRType &padded_type = this->get<SPIRType>(type_id);
+			statement(qual_var_name, " = ", remap_swizzle(padded_type, type_components, to_name(var.self)), ";");
+		});
+	}
+	else
+		ir.meta[var.self].decoration.qualified_alias = qual_var_name;
 
 	// Copy the variable location from the original variable to the member
 	if (get_decoration_bitset(var.self).get(DecorationLocation))
@@ -890,7 +949,26 @@
 	{
 		// Add a reference to the variable type to the interface struct.
 		uint32_t ib_mbr_idx = uint32_t(ib_type.member_types.size());
-		ib_type.member_types.push_back(usable_type->self);
+
+		uint32_t target_components = 0;
+		bool padded_output = false;
+		uint32_t type_id = usable_type->self;
+
+		// Check if we need to pad fragment output to match a certain number of components.
+		if (get_decoration_bitset(var.self).get(DecorationLocation) && msl_options.pad_fragment_output_components &&
+		    get_entry_point().model == ExecutionModelFragment && storage == StorageClassOutput)
+		{
+			uint32_t locn = get_decoration(var.self, DecorationLocation) + i;
+			target_components = get_target_components_for_fragment_location(locn);
+			if (usable_type->vecsize < target_components)
+			{
+				// Make a new type here.
+				type_id = build_extended_vector_type(usable_type->self, target_components);
+				padded_output = true;
+			}
+		}
+
+		ib_type.member_types.push_back(get_pointee_type_id(type_id));
 
 		// Give the member a name
 		string mbr_name = ensure_valid_name(join(to_expression(var.self), "_", i), "m");
@@ -930,12 +1008,21 @@
 		{
 		case StorageClassInput:
 			entry_func.fixup_hooks_in.push_back(
-			    [=]() { statement(to_name(var.self), "[", i, "] = ", ib_var_ref, ".", mbr_name, ";"); });
+			    [=, &var]() { statement(to_name(var.self), "[", i, "] = ", ib_var_ref, ".", mbr_name, ";"); });
 			break;
 
 		case StorageClassOutput:
-			entry_func.fixup_hooks_out.push_back(
-			    [=]() { statement(ib_var_ref, ".", mbr_name, " = ", to_name(var.self), "[", i, "];"); });
+			entry_func.fixup_hooks_out.push_back([=, &var]() {
+				if (padded_output)
+				{
+					auto &padded_type = this->get<SPIRType>(type_id);
+					statement(ib_var_ref, ".", mbr_name, " = ",
+					          remap_swizzle(padded_type, usable_type->vecsize, join(to_name(var.self), "[", i, "]")),
+					          ";");
+				}
+				else
+					statement(ib_var_ref, ".", mbr_name, " = ", to_name(var.self), "[", i, "];");
+			});
 			break;
 
 		default:
@@ -1053,14 +1140,14 @@
 		switch (storage)
 		{
 		case StorageClassInput:
-			entry_func.fixup_hooks_in.push_back([=]() {
+			entry_func.fixup_hooks_in.push_back([=, &var, &var_type]() {
 				statement(to_name(var.self), ".", to_member_name(var_type, mbr_idx), "[", i, "] = ", ib_var_ref, ".",
 				          mbr_name, ";");
 			});
 			break;
 
 		case StorageClassOutput:
-			entry_func.fixup_hooks_out.push_back([=]() {
+			entry_func.fixup_hooks_out.push_back([=, &var, &var_type]() {
 				statement(ib_var_ref, ".", mbr_name, " = ", to_name(var.self), ".", to_member_name(var_type, mbr_idx),
 				          "[", i, "];");
 			});
@@ -1115,13 +1202,13 @@
 		switch (storage)
 		{
 		case StorageClassInput:
-			entry_func.fixup_hooks_in.push_back([=]() {
+			entry_func.fixup_hooks_in.push_back([=, &var, &var_type]() {
 				statement(to_name(var.self), ".", to_member_name(var_type, mbr_idx), " = ", qual_var_name, ";");
 			});
 			break;
 
 		case StorageClassOutput:
-			entry_func.fixup_hooks_out.push_back([=]() {
+			entry_func.fixup_hooks_out.push_back([=, &var, &var_type]() {
 				statement(qual_var_name, " = ", to_name(var.self), ".", to_member_name(var_type, mbr_idx), ";");
 			});
 			break;
@@ -3684,7 +3771,31 @@
 // Manufacture automatic sampler arg for SampledImage texture.
 string CompilerMSL::to_func_call_arg(uint32_t id)
 {
-	string arg_str = CompilerGLSL::to_func_call_arg(id);
+	string arg_str;
+
+	auto *c = maybe_get<SPIRConstant>(id);
+	if (c && !get<SPIRType>(c->constant_type).array.empty())
+	{
+		// If we are passing a constant array directly to a function for some reason,
+		// the callee will expect an argument in thread const address space
+		// (since we can only bind to arrays with references in MSL).
+		// To resolve this, we must emit a copy in this address space.
+		// This kind of code gen should be rare enough that performance is not a real concern.
+		// Inline the SPIR-V to avoid this kind of suboptimal codegen.
+		//
+		// We risk calling this inside a continue block (invalid code),
+		// so just create a thread local copy in the current function.
+		arg_str = join("_", id, "_array_copy");
+		auto &constants = current_function->constant_arrays_needed_on_stack;
+		auto itr = find(begin(constants), end(constants), id);
+		if (itr == end(constants))
+		{
+			force_recompile = true;
+			constants.push_back(id);
+		}
+	}
+	else
+		arg_str = CompilerGLSL::to_func_call_arg(id);
 
 	// Manufacture automatic sampler arg if the arg is a SampledImage texture.
 	auto &type = expression_type(id);
@@ -4514,8 +4625,26 @@
 	    (storage == StorageClassFunction || storage == StorageClassGeneric))
 	{
 		// If the argument is a pure value and not an opaque type, we will pass by value.
-		decl += " ";
-		decl += to_expression(name_id);
+		if (is_array(type))
+		{
+			// We are receiving an array by value. This is problematic.
+			// We cannot be sure of the target address space since we are supposed to receive a copy,
+			// but this is not possible with MSL without some extra work.
+			// We will have to assume we're getting a reference in thread address space.
+			// If we happen to get a reference in constant address space, the caller must emit a copy and pass that.
+			// Thread const therefore becomes the only logical choice, since we cannot "create" a constant array from
+			// non-constant arrays, but we can create thread const from constant.
+			decl = string("thread const ") + decl;
+			decl += " (&";
+			decl += to_expression(name_id);
+			decl += ")";
+			decl += type_to_array_glsl(type);
+		}
+		else
+		{
+			decl += " ";
+			decl += to_expression(name_id);
+		}
 	}
 	else if (is_array(type) && !type_is_image)
 	{
@@ -4581,12 +4710,254 @@
 	// FIXME: MSL and GLSL are doing two different things here.
 	// Agree on convention and remove this override.
 	static const unordered_set<string> keywords = {
-		"kernel", "vertex", "fragment", "compute", "bias",
+		"kernel",
+		"vertex",
+		"fragment",
+		"compute",
+		"bias",
+		"assert",
+		"VARIABLE_TRACEPOINT",
+		"STATIC_DATA_TRACEPOINT",
+		"STATIC_DATA_TRACEPOINT_V",
+		"METAL_ALIGN",
+		"METAL_ASM",
+		"METAL_CONST",
+		"METAL_DEPRECATED",
+		"METAL_ENABLE_IF",
+		"METAL_FUNC",
+		"METAL_INTERNAL",
+		"METAL_NON_NULL_RETURN",
+		"METAL_NORETURN",
+		"METAL_NOTHROW",
+		"METAL_PURE",
+		"METAL_UNAVAILABLE",
+		"METAL_IMPLICIT",
+		"METAL_EXPLICIT",
+		"METAL_CONST_ARG",
+		"METAL_ARG_UNIFORM",
+		"METAL_ZERO_ARG",
+		"METAL_VALID_LOD_ARG",
+		"METAL_VALID_LEVEL_ARG",
+		"METAL_VALID_STORE_ORDER",
+		"METAL_VALID_LOAD_ORDER",
+		"METAL_VALID_COMPARE_EXCHANGE_FAILURE_ORDER",
+		"METAL_COMPATIBLE_COMPARE_EXCHANGE_ORDERS",
+		"METAL_VALID_RENDER_TARGET",
+		"is_function_constant_defined",
+		"CHAR_BIT",
+		"SCHAR_MAX",
+		"SCHAR_MIN",
+		"UCHAR_MAX",
+		"CHAR_MAX",
+		"CHAR_MIN",
+		"USHRT_MAX",
+		"SHRT_MAX",
+		"SHRT_MIN",
+		"UINT_MAX",
+		"INT_MAX",
+		"INT_MIN",
+		"FLT_DIG",
+		"FLT_MANT_DIG",
+		"FLT_MAX_10_EXP",
+		"FLT_MAX_EXP",
+		"FLT_MIN_10_EXP",
+		"FLT_MIN_EXP",
+		"FLT_RADIX",
+		"FLT_MAX",
+		"FLT_MIN",
+		"FLT_EPSILON",
+		"FP_ILOGB0",
+		"FP_ILOGBNAN",
+		"MAXFLOAT",
+		"HUGE_VALF",
+		"INFINITY",
+		"NAN",
+		"M_E_F",
+		"M_LOG2E_F",
+		"M_LOG10E_F",
+		"M_LN2_F",
+		"M_LN10_F",
+		"M_PI_F",
+		"M_PI_2_F",
+		"M_PI_4_F",
+		"M_1_PI_F",
+		"M_2_PI_F",
+		"M_2_SQRTPI_F",
+		"M_SQRT2_F",
+		"M_SQRT1_2_F",
+		"HALF_DIG",
+		"HALF_MANT_DIG",
+		"HALF_MAX_10_EXP",
+		"HALF_MAX_EXP",
+		"HALF_MIN_10_EXP",
+		"HALF_MIN_EXP",
+		"HALF_RADIX",
+		"HALF_MAX",
+		"HALF_MIN",
+		"HALF_EPSILON",
+		"MAXHALF",
+		"HUGE_VALH",
+		"M_E_H",
+		"M_LOG2E_H",
+		"M_LOG10E_H",
+		"M_LN2_H",
+		"M_LN10_H",
+		"M_PI_H",
+		"M_PI_2_H",
+		"M_PI_4_H",
+		"M_1_PI_H",
+		"M_2_PI_H",
+		"M_2_SQRTPI_H",
+		"M_SQRT2_H",
+		"M_SQRT1_2_H",
+		"DBL_DIG",
+		"DBL_MANT_DIG",
+		"DBL_MAX_10_EXP",
+		"DBL_MAX_EXP",
+		"DBL_MIN_10_EXP",
+		"DBL_MIN_EXP",
+		"DBL_RADIX",
+		"DBL_MAX",
+		"DBL_MIN",
+		"DBL_EPSILON",
+		"HUGE_VAL",
+		"M_E",
+		"M_LOG2E",
+		"M_LOG10E",
+		"M_LN2",
+		"M_LN10",
+		"M_PI",
+		"M_PI_2",
+		"M_PI_4",
+		"M_1_PI",
+		"M_2_PI",
+		"M_2_SQRTPI",
+		"M_SQRT2",
+		"M_SQRT1_2",
 	};
 
 	static const unordered_set<string> illegal_func_names = {
 		"main",
 		"saturate",
+		"assert",
+		"VARIABLE_TRACEPOINT",
+		"STATIC_DATA_TRACEPOINT",
+		"STATIC_DATA_TRACEPOINT_V",
+		"METAL_ALIGN",
+		"METAL_ASM",
+		"METAL_CONST",
+		"METAL_DEPRECATED",
+		"METAL_ENABLE_IF",
+		"METAL_FUNC",
+		"METAL_INTERNAL",
+		"METAL_NON_NULL_RETURN",
+		"METAL_NORETURN",
+		"METAL_NOTHROW",
+		"METAL_PURE",
+		"METAL_UNAVAILABLE",
+		"METAL_IMPLICIT",
+		"METAL_EXPLICIT",
+		"METAL_CONST_ARG",
+		"METAL_ARG_UNIFORM",
+		"METAL_ZERO_ARG",
+		"METAL_VALID_LOD_ARG",
+		"METAL_VALID_LEVEL_ARG",
+		"METAL_VALID_STORE_ORDER",
+		"METAL_VALID_LOAD_ORDER",
+		"METAL_VALID_COMPARE_EXCHANGE_FAILURE_ORDER",
+		"METAL_COMPATIBLE_COMPARE_EXCHANGE_ORDERS",
+		"METAL_VALID_RENDER_TARGET",
+		"is_function_constant_defined",
+		"CHAR_BIT",
+		"SCHAR_MAX",
+		"SCHAR_MIN",
+		"UCHAR_MAX",
+		"CHAR_MAX",
+		"CHAR_MIN",
+		"USHRT_MAX",
+		"SHRT_MAX",
+		"SHRT_MIN",
+		"UINT_MAX",
+		"INT_MAX",
+		"INT_MIN",
+		"FLT_DIG",
+		"FLT_MANT_DIG",
+		"FLT_MAX_10_EXP",
+		"FLT_MAX_EXP",
+		"FLT_MIN_10_EXP",
+		"FLT_MIN_EXP",
+		"FLT_RADIX",
+		"FLT_MAX",
+		"FLT_MIN",
+		"FLT_EPSILON",
+		"FP_ILOGB0",
+		"FP_ILOGBNAN",
+		"MAXFLOAT",
+		"HUGE_VALF",
+		"INFINITY",
+		"NAN",
+		"M_E_F",
+		"M_LOG2E_F",
+		"M_LOG10E_F",
+		"M_LN2_F",
+		"M_LN10_F",
+		"M_PI_F",
+		"M_PI_2_F",
+		"M_PI_4_F",
+		"M_1_PI_F",
+		"M_2_PI_F",
+		"M_2_SQRTPI_F",
+		"M_SQRT2_F",
+		"M_SQRT1_2_F",
+		"HALF_DIG",
+		"HALF_MANT_DIG",
+		"HALF_MAX_10_EXP",
+		"HALF_MAX_EXP",
+		"HALF_MIN_10_EXP",
+		"HALF_MIN_EXP",
+		"HALF_RADIX",
+		"HALF_MAX",
+		"HALF_MIN",
+		"HALF_EPSILON",
+		"MAXHALF",
+		"HUGE_VALH",
+		"M_E_H",
+		"M_LOG2E_H",
+		"M_LOG10E_H",
+		"M_LN2_H",
+		"M_LN10_H",
+		"M_PI_H",
+		"M_PI_2_H",
+		"M_PI_4_H",
+		"M_1_PI_H",
+		"M_2_PI_H",
+		"M_2_SQRTPI_H",
+		"M_SQRT2_H",
+		"M_SQRT1_2_H",
+		"DBL_DIG",
+		"DBL_MANT_DIG",
+		"DBL_MAX_10_EXP",
+		"DBL_MAX_EXP",
+		"DBL_MIN_10_EXP",
+		"DBL_MIN_EXP",
+		"DBL_RADIX",
+		"DBL_MAX",
+		"DBL_MIN",
+		"DBL_EPSILON",
+		"HUGE_VAL",
+		"M_E",
+		"M_LOG2E",
+		"M_LOG10E",
+		"M_LN2",
+		"M_LN10",
+		"M_PI",
+		"M_PI_2",
+		"M_PI_4",
+		"M_1_PI",
+		"M_2_PI",
+		"M_2_SQRTPI",
+		"M_SQRT2",
+		"M_SQRT1_2",
 	};
 
 	ir.for_each_typed_id<SPIRVariable>([&](uint32_t self, SPIRVariable &) {
diff --git a/spirv_msl.hpp b/spirv_msl.hpp
index eff1b25..f4520dd 100644
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@@ -168,6 +168,10 @@
 		bool disable_rasterization = false;
 		bool swizzle_texture_samples = false;
 
+		// Fragment output in MSL must have at least as many components as the render pass.
+		// Add support to explicitly pad out components.
+		bool pad_fragment_output_components = false;
+
 		bool is_ios()
 		{
 			return platform == iOS;
@@ -312,6 +316,10 @@
 	// The remapped sampler must not be an array of samplers.
 	void remap_constexpr_sampler(uint32_t id, const MSLConstexprSampler &sampler);
 
+	// If using CompilerMSL::Options::pad_fragment_output_components, override the number of components we expect
+	// to use for a particular location. The default is 4 if the number of components is not overridden.
+	void set_fragment_output_components(uint32_t location, uint32_t components);
+
 protected:
 	void emit_binary_unord_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op);
 	void emit_instruction(const Instruction &instr) override;
@@ -428,6 +436,7 @@
 	Options msl_options;
 	std::set<SPVFuncImpl> spv_function_implementations;
 	std::unordered_map<uint32_t, MSLVertexAttr *> vtx_attrs_by_location;
+	std::unordered_map<uint32_t, uint32_t> fragment_output_components;
 	std::unordered_map<MSLStructMemberKey, uint32_t> struct_member_padding;
 	std::set<std::string> pragma_lines;
 	std::set<std::string> typedef_lines;
@@ -451,6 +460,9 @@
 	std::unordered_map<uint32_t, MSLConstexprSampler> constexpr_samplers;
 	std::vector<uint32_t> buffer_arrays;
 
+	uint32_t get_target_components_for_fragment_location(uint32_t location) const;
+	uint32_t build_extended_vector_type(uint32_t type_id, uint32_t components);
+
 	// OpcodeHandler that handles several MSL preprocessing operations.
 	struct OpCodePreprocessor : OpcodeHandler
 	{
diff --git a/test_shaders.py b/test_shaders.py
index 045c255..92fe5c4 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -129,7 +129,7 @@
             raise
     except subprocess.CalledProcessError:
         print('Error compiling Metal shader: ' + msl_path)
-        sys.exit(1)
+        raise RuntimeError('Failed to compile Metal shader')
 
 def cross_compile_msl(shader, spirv, opt):
     spirv_path = create_temporary()
@@ -152,6 +152,8 @@
         msl_args.append('--msl-swizzle-texture-samples')
     if '.ios.' in shader:
         msl_args.append('--msl-ios')
+    if '.pad-fragment.' in shader:
+        msl_args.append('--msl-pad-fragment-output')
 
     subprocess.check_call(msl_args)
 
@@ -201,12 +203,15 @@
             subprocess.check_call(['fxc', '-nologo', shader_model_hlsl(shader), win_path])
         except OSError as oe:
             if (oe.errno != errno.ENOENT): # Ignore not found errors
+                print('Failed to run FXC.')
+                ignore_fxc = True
                 raise
             else:
+                print('Could not find FXC.')
                 ignore_fxc = True
         except subprocess.CalledProcessError:
             print('Failed compiling HLSL shader:', shader, 'with FXC.')
-            sys.exit(1)
+            raise RuntimeError('Failed compiling HLSL shader')
 
 def shader_to_sm(shader):
     if '.sm60.' in shader:
@@ -382,7 +387,8 @@
                 # Otherwise, fail the test. Keep the shader file around so we can inspect.
                 if not keep:
                     remove_file(json_file)
-                sys.exit(1)
+
+                raise RuntimeError('Does not match reference')
         else:
             remove_file(json_file)
     else:
@@ -417,7 +423,7 @@
                 # Otherwise, fail the test. Keep the shader file around so we can inspect.
                 if not keep:
                     remove_file(glsl)
-                sys.exit(1)
+                raise RuntimeError('Does not match reference')
         else:
             remove_file(glsl)
     else:
@@ -533,14 +539,18 @@
     remove_file(spirv)
 
 def test_shader_file(relpath, stats, shader_dir, update, keep, opt, force_no_external_validation, backend):
-    if backend == 'msl':
-        test_shader_msl(stats, (shader_dir, relpath), update, keep, opt, force_no_external_validation)
-    elif backend == 'hlsl':
-        test_shader_hlsl(stats, (shader_dir, relpath), update, keep, opt, force_no_external_validation)
-    elif backend == 'reflect':
-        test_shader_reflect(stats, (shader_dir, relpath), update, keep, opt)
-    else:
-        test_shader(stats, (shader_dir, relpath), update, keep, opt)
+    try:
+        if backend == 'msl':
+            test_shader_msl(stats, (shader_dir, relpath), update, keep, opt, force_no_external_validation)
+        elif backend == 'hlsl':
+            test_shader_hlsl(stats, (shader_dir, relpath), update, keep, opt, force_no_external_validation)
+        elif backend == 'reflect':
+            test_shader_reflect(stats, (shader_dir, relpath), update, keep, opt)
+        else:
+            test_shader(stats, (shader_dir, relpath), update, keep, opt)
+        return None
+    except Exception as e:
+        return e
 
 def test_shaders_helper(stats, backend, args):
     all_files = []
@@ -555,17 +565,27 @@
     # at this point we need to switch to explicit arguments
     if args.parallel:
         pool = multiprocessing.Pool(multiprocessing.cpu_count())
-        pool.map(partial(test_shader_file,
-            stats = stats,
-            shader_dir = args.folder,
-            update = args.update,
-            keep = args.keep,
-            opt = args.opt,
-            force_no_external_validation = args.force_no_external_validation,
-            backend = backend), all_files)
+
+        results = []
+        for f in all_files:
+            results.append(pool.apply_async(test_shader_file,
+                args = (f, stats,
+                args.folder, args.update, args.keep, args.opt, args.force_no_external_validation,
+                backend)))
+
+        for res in results:
+            error = res.get()
+            if error is not None:
+                pool.close()
+                pool.join()
+                print('Error:', error)
+                sys.exit(1)
     else:
         for i in all_files:
-            test_shader_file(i, stats, args.folder, args.update, args.keep, args.opt, args.force_no_external_validation, backend)
+            e = test_shader_file(i, stats, args.folder, args.update, args.keep, args.opt, args.force_no_external_validation, backend)
+            if e is not None:
+                print('Error:', e)
+                sys.exit(1)
 
 def test_shaders(backend, args):
     if args.malisc: