/*
* Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "util/register_allocate.h"
#include "compiler_defines.h"
#include "bifrost_sched.h"
#include "bifrost_compile.h"
#include "bifrost_print.h"
#define BI_DEBUG
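/* Register-class layout: the first 64 entries of the flat register
 * namespace are the scalar (primary) registers; the vec2, vec3 and
 * vec4 classes live at vec2_base/vec3_base/vec4_base in the same
 * namespace and alias onto the scalar registers through the
 * conflicts registered in allocate_registers() below. */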
const unsigned max_primary_reg = 64; /* Overestimate because of special regs */
const unsigned max_vec2_reg = 32;
const unsigned max_vec3_reg = 16; // XXX: Do we need to align vec3 to vec4 boundary?
const unsigned max_vec4_reg = 16;
const unsigned max_registers = 128; /* Sum of classes */
const unsigned primary_base = 0;
const unsigned vec2_base = 64;
const unsigned vec3_base = 96; /* above base + max_class_reg */
const unsigned vec4_base = 112;
const unsigned vec4_end = 128;
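/* Map an SSA index to a densely packed temp index. Entries are
 * stored offset by one in the hash table so that 0 (the table's
 * "not found" return value) stays distinguishable from temp 0. */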
static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
if (hash >= SSA_FIXED_MINIMUM)
return hash;
unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(ctx->hash_to_temp, hash + 1);
if (temp)
return temp - 1;
/* If no temp is found, allocate one */
temp = ctx->num_temps++;
ctx->max_hash = MAX2(ctx->max_hash, hash);
_mesa_hash_table_u64_insert(ctx->hash_to_temp, hash + 1, (void *) ((uintptr_t) temp + 1));
return temp;
}
static bool
is_live_in_instr(bifrost_instruction *instr, unsigned temp)
{
if (instr->ssa_args.src0 == temp) return true;
if (instr->ssa_args.src1 == temp) return true;
if (instr->ssa_args.src2 == temp) return true;
if (instr->ssa_args.src3 == temp) return true;
return false;
}
static bool
is_live_after_instr(compiler_context *ctx, bifrost_block *blk, bifrost_instruction *instr, unsigned temp)
{
// Scan forward in the block from this location to see if we are still live.
mir_foreach_instr_in_block_from(blk, ins, mir_next_instr(instr)) {
if (is_live_in_instr(ins, temp))
return true;
}
// XXX: Walk all successor blocks and ensure the value isn't used there
return false;
}
static uint32_t
ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
{
for (unsigned i = primary_base; i < vec4_end; ++i) {
if (BITSET_TEST(regs, i)) {
return i;
}
}
assert(0);
return 0;
}
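/* Map an RA node back to a physical scalar register index. Scalar
 * nodes map 1:1; a vecN node is rebased against its class base and
 * scaled by the class alignment (2 for vec2, 4 for vec3/vec4).
 * E.g. a vec2 node given r = 70 lands on scalar register
 * (70 - vec2_base) * 2 = 12. */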
static uint32_t
ra_get_phys_reg(compiler_context *ctx, struct ra_graph *g, unsigned temp, unsigned max_reg)
{
if (temp == SSA_INVALID_VALUE ||
temp >= SSA_FIXED_UREG_MINIMUM ||
temp == SSA_FIXED_CONST_0)
return temp;
if (temp >= SSA_FIXED_MINIMUM)
return SSA_REG_FROM_FIXED(temp);
assert(temp < max_reg);
uint32_t r = ra_get_node_reg(g, temp);
if (r >= vec4_base)
return (r - vec4_base) * 4;
else if (r >= vec3_base)
return (r - vec3_base) * 4;
else if (r >= vec2_base)
return (r - vec2_base) * 2;
return r;
}
static void
allocate_registers(compiler_context *ctx)
{
struct ra_regs *regs = ra_alloc_reg_set(NULL, max_registers, true);
int primary_class = ra_alloc_reg_class(regs);
int vec2_class = ra_alloc_reg_class(regs);
int vec3_class = ra_alloc_reg_class(regs);
int vec4_class = ra_alloc_reg_class(regs);
// Allocate our register classes and conflicts
{
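/* The conflict math below: vec2 register i occupies scalar slots
 * 2i and 2i+1, while vec3 and vec4 register i are both vec4-aligned
 * and occupy scalar slots 4i..4i+2 and 4i..4i+3 respectively, so a
 * vec3 and a vec4 with the same index conflict outright. */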
unsigned reg = 0;
// Add all of our primary scalar registers
for (unsigned i = 0; i < max_primary_reg; ++i) {
ra_class_add_reg(regs, primary_class, reg);
reg++;
}
// Add all of our vec2 class registers
// These alias with the scalar registers
for (unsigned i = 0; i < max_vec2_reg; ++i) {
ra_class_add_reg(regs, vec2_class, reg);
// Tell RA that this conflicts with primary class registers
// Make sure to tell the RA utility all conflict slots
ra_add_reg_conflict(regs, reg, primary_base + i*2 + 0);
ra_add_reg_conflict(regs, reg, primary_base + i*2 + 1);
reg++;
}
// Add all of our vec3 class registers
// These alias with the scalar registers
for (unsigned i = 0; i < max_vec3_reg; ++i) {
ra_class_add_reg(regs, vec3_class, reg);
// Tell RA that this conflicts with primary class registers
// Make sure to tell the RA utility all conflict slots
// These are aligned to vec4 even though they only conflict with a vec3 wide slot
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 0);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 1);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 2);
// State that this class conflicts with the vec2 class
ra_add_reg_conflict(regs, reg, vec2_base + i*2 + 0);
ra_add_reg_conflict(regs, reg, vec2_base + i*2 + 1);
reg++;
}
// Add all of our vec4 class registers
// These alias with the scalar registers
for (unsigned i = 0; i < max_vec4_reg; ++i) {
ra_class_add_reg(regs, vec4_class, reg);
// Tell RA that this conflicts with primary class registers
// Make sure to tell the RA utility all conflict slots
// These are vec4 aligned and conflict with the full vec4-wide slot
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 0);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 1);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 2);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 3);
// State that this class conflicts with the vec2 class
ra_add_reg_conflict(regs, reg, vec2_base + i*2 + 0);
ra_add_reg_conflict(regs, reg, vec2_base + i*2 + 1);
// State that this class conflicts with the vec3 class
// They conflict on the exact same location due to alignments
ra_add_reg_conflict(regs, reg, vec3_base + i);
reg++;
}
}
ra_set_finalize(regs, NULL);
mir_foreach_block(ctx, block) {
mir_foreach_instr_in_block(block, instr) {
instr->ssa_args.src0 = find_or_allocate_temp(ctx, instr->ssa_args.src0);
instr->ssa_args.src1 = find_or_allocate_temp(ctx, instr->ssa_args.src1);
instr->ssa_args.src2 = find_or_allocate_temp(ctx, instr->ssa_args.src2);
instr->ssa_args.src3 = find_or_allocate_temp(ctx, instr->ssa_args.src3);
instr->ssa_args.dest = find_or_allocate_temp(ctx, instr->ssa_args.dest);
}
}
uint32_t nodes = ctx->num_temps;
struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
mir_foreach_block(ctx, block) {
mir_foreach_instr_in_block(block, instr) {
if (instr->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
if (instr->dest_components == 4)
ra_set_node_class(g, instr->ssa_args.dest, vec4_class);
else if (instr->dest_components == 3)
ra_set_node_class(g, instr->ssa_args.dest, vec3_class);
else if (instr->dest_components == 2)
ra_set_node_class(g, instr->ssa_args.dest, vec2_class);
else
ra_set_node_class(g, instr->ssa_args.dest, primary_class);
}
}
uint32_t *live_start = malloc(nodes * sizeof(uint32_t));
uint32_t *live_end = malloc(nodes * sizeof(uint32_t));
memset(live_start, 0xFF, nodes * sizeof(uint32_t));
memset(live_end, 0xFF, nodes * sizeof(uint32_t));
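/* Compute coarse live ranges in one linear walk: every instruction
 * gets a monotonically increasing location, a temp becomes live at
 * its first write and dies after its last use within the block
 * (cross-block liveness is still the XXX in is_live_after_instr). */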
uint32_t location = 0;
mir_foreach_block(ctx, block) {
mir_foreach_instr_in_block(block, instr) {
if (instr->ssa_args.dest < SSA_FIXED_MINIMUM) {
// If the destination isn't live yet at this point then this is
// where it becomes live, since this instruction writes it
if (live_start[instr->ssa_args.dest] == ~0U) {
live_start[instr->ssa_args.dest] = location;
}
}
uint32_t sources[4] = {
instr->ssa_args.src0,
instr->ssa_args.src1,
instr->ssa_args.src2,
instr->ssa_args.src3,
};
for (unsigned i = 0; i < 4; ++i) {
if (sources[i] >= SSA_FIXED_MINIMUM)
continue;
// If the source is no longer live after this instruction then we can end its liveness
if (!is_live_after_instr(ctx, block, instr, sources[i])) {
live_end[sources[i]] = location;
}
}
++location;
}
}
// Spin through the nodes quickly and ensure they are all killed by the end of the program
for (unsigned i = 0; i < nodes; ++i) {
if (live_end[i] == ~0U)
live_end[i] = location;
}
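/* Two nodes interfere when their live ranges overlap. The ranges are
 * treated as half-open intervals, so a range ending exactly where
 * another starts does not conflict. This is the naive O(n^2)
 * pairwise test. */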
for (unsigned i = 0; i < nodes; ++i) {
for (unsigned j = i + 1; j < nodes; ++j) {
if (!(live_start[i] >= live_end[j] || live_start[j] >= live_end[i])) {
ra_add_node_interference(g, i, j);
}
}
}
ra_set_select_reg_callback(g, ra_select_callback, NULL);
if (!ra_allocate(g)) {
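// Allocation failed; there is no spill handling yet, so this is fatal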
assert(0);
}
free(live_start);
free(live_end);
mir_foreach_block(ctx, block) {
mir_foreach_instr_in_block(block, instr) {
instr->args.src0 = ra_get_phys_reg(ctx, g, instr->ssa_args.src0, nodes);
instr->args.src1 = ra_get_phys_reg(ctx, g, instr->ssa_args.src1, nodes);
instr->args.src2 = ra_get_phys_reg(ctx, g, instr->ssa_args.src2, nodes);
instr->args.src3 = ra_get_phys_reg(ctx, g, instr->ssa_args.src3, nodes);
instr->args.dest = ra_get_phys_reg(ctx, g, instr->ssa_args.dest, nodes);
}
}
}
static void
bundle_block(compiler_context *ctx, bifrost_block *block)
{
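// XXX: clause bundling is not implemented yet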
}
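/* Lower op_create_vector after RA: each instruction that produced a
 * component of the vector is rewritten to write straight into the
 * corresponding component of the vector's allocated register
 * (args.dest + i), after which the create_vector is deleted. */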
static void
remove_create_vectors(compiler_context *ctx, bifrost_block *block)
{
mir_foreach_instr_in_block_safe(block, instr) {
if (instr->op != op_create_vector) continue;
uint32_t vector_ssa_sources[4] = {
instr->ssa_args.src0,
instr->ssa_args.src1,
instr->ssa_args.src2,
instr->ssa_args.src3,
};
mir_foreach_instr_in_block_from_rev(block, next_instr, instr) {
// Walk our block backwards and find the producers of this vector's components
for (unsigned i = 0; i < instr->dest_components; ++i) {
// If this instruction is the one that writes this register then forward it to the real register
if (vector_ssa_sources[i] == next_instr->ssa_args.dest) {
// Source instruction destination is a vector register of size dest_components
// So dest + i gets the components of it
next_instr->args.dest = instr->args.dest + i;
}
}
}
// Remove the instruction now that we have copied over all the sources
mir_remove_instr(instr);
}
}
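/* Lower op_extract_element after RA: every later use of the
 * extracted value is rewritten to read the physical register at
 * vector base + element index (args.src0 + literal_args[0]), after
 * which the extract itself is deleted. */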
static void
remove_extract_elements(compiler_context *ctx, bifrost_block *block)
{
mir_foreach_instr_in_block_safe(block, instr) {
if (instr->op != op_extract_element) continue;
mir_foreach_instr_in_block_from(block, next_instr, instr) {
// Walk our block forward to replace uses of this register with a real register
// src0 = vector
// src1 = index in to vector
uint32_t vector_ssa_sources[4] = {
next_instr->ssa_args.src0,
next_instr->ssa_args.src1,
next_instr->ssa_args.src2,
next_instr->ssa_args.src3,
};
uint32_t *vector_sources[4] = {
&next_instr->args.src0,
&next_instr->args.src1,
&next_instr->args.src2,
&next_instr->args.src3,
};
for (unsigned i = 0; i < 4; ++i) {
if (vector_ssa_sources[i] == instr->ssa_args.dest) {
// This source uses this vector extraction
// Replace its usage with the real register
// src0 is a vector register and src1 is the constant element of the vector
*vector_sources[i] = instr->args.src0 + instr->literal_args[0];
}
}
}
// Remove the instruction now that we have copied over all the sources
mir_remove_instr(instr);
}
}
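/* Top-level scheduling entry point: allocate registers, lower the
 * vector pseudo-ops that only existed to carry SSA vector semantics,
 * then hand each block to the (currently stubbed) bundler. */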
void
schedule_program(compiler_context *ctx)
{
// XXX: before RA we should group instructions that feed into each other so they can be scheduled in the same clause
allocate_registers(ctx);
mir_foreach_block(ctx, block) {
remove_create_vectors(ctx, block);
remove_extract_elements(ctx, block);
}
mir_foreach_block(ctx, block) {
#ifdef BI_DEBUG
print_mir_block(block, true);
#endif
bundle_block(ctx, block);
}
}