src/amd/compiler/aco_reduce_assign.cpp - third_party/mesa - Git at Google

 /*
  * Copyright © 2018 Valve Corporation
  * Copyright © 2018 Google
  *
  * SPDX-License-Identifier: MIT
  */

 #include "aco_builder.h"
 #include "aco_ir.h"

 #include <algorithm>
 #include <vector>

 /*
  * Insert p_linear_start instructions right before RA to correctly allocate
  * temporaries for reductions that have to disrespect EXEC by executing in
  * WWM.
  */

 namespace aco {

 void
 setup_reduce_temp(Program* program)
 {
    unsigned last_top_level_block_idx = 0;
    unsigned maxSize = 0;

    std::vector<bool> hasReductions(program->blocks.size());
    for (Block& block : program->blocks) {
       for (aco_ptr<Instruction>& instr : block.instructions) {
          if (instr->opcode == aco_opcode::p_interp_gfx11 ||
              instr->opcode == aco_opcode::p_bpermute_permlane) {
             maxSize = MAX2(maxSize, 1);
             hasReductions[block.index] = true;
          } else if (instr->format == Format::PSEUDO_REDUCTION) {
             maxSize = MAX2(maxSize, instr->operands[0].size());
             hasReductions[block.index] = true;
          }
       }
    }

    if (maxSize == 0)
       return;

    assert(maxSize == 1 || maxSize == 2);
    Temp reduceTmp(0, RegClass(RegType::vgpr, maxSize).as_linear());
    Temp vtmp(0, RegClass(RegType::vgpr, maxSize).as_linear());
    int inserted_at = -1;
    int vtmp_inserted_at = -1;

    for (Block& block : program->blocks) {

       if (block.kind & block_kind_top_level) {
          last_top_level_block_idx = block.index;

          /* TODO: this could be improved in this case:
           *    start_linear_vgpr
           *    if (...) {
           *       use_linear_vgpr
           *    }
           *    end_linear_vgpr
           * Here, the linear vgpr is used before any phi copies, so this isn't necessary.
           */
          if (inserted_at >= 0) {
             aco_ptr<Instruction> end{create_instruction(
                aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_inserted_at >= 0 ? 2 : 1, 0)};
             end->operands[0] = Operand(reduceTmp);
             if (vtmp_inserted_at >= 0)
                end->operands[1] = Operand(vtmp);

             /* insert after the phis of the block */
             std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
             while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
                ++it;
             block.instructions.insert(it, std::move(end));
             inserted_at = vtmp_inserted_at = -1;
          }
       }

       if (!hasReductions[block.index])
          continue;

       std::vector<aco_ptr<Instruction>>::iterator it;
       for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
          Instruction* instr = (*it).get();
          if (instr->format != Format::PSEUDO_REDUCTION &&
              instr->opcode != aco_opcode::p_interp_gfx11 &&
              instr->opcode != aco_opcode::p_bpermute_permlane)
             continue;

          if ((int)last_top_level_block_idx != inserted_at) {
             reduceTmp = program->allocateTmp(reduceTmp.regClass());
             aco_ptr<Instruction> create{
                create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
             create->definitions[0] = Definition(reduceTmp);
             /* find the right place to insert this definition */
             if (last_top_level_block_idx == block.index) {
                /* insert right before the current instruction */
                it = block.instructions.insert(it, std::move(create));
                it++;
                /* inserted_at is intentionally not updated here, so later blocks
                 * would insert at the end instead of using this one. */
             } else {
                assert(last_top_level_block_idx < block.index);
                /* insert after p_logical_end of the last top-level block */
                std::vector<aco_ptr<Instruction>>& instructions =
                   program->blocks[last_top_level_block_idx].instructions;
                auto insert_point =
                   std::find_if(instructions.rbegin(), instructions.rend(),
                                [](const auto& iter) {
                                   return iter->opcode == aco_opcode::p_logical_end;
                                })
                      .base();
                instructions.insert(insert_point, std::move(create));
                inserted_at = last_top_level_block_idx;
             }
          }

          /* same as before, except for the vector temporary instead of the reduce temporary */
          bool need_vtmp = false;
          if (instr->isReduction()) {
             ReduceOp op = instr->reduction().reduce_op;
             unsigned cluster_size = instr->reduction().cluster_size;
             need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
                         op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
                         op == imax64 || op == imul64;
             bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
                                    op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
                                    op == iadd64;

             if (program->gfx_level >= GFX10 && cluster_size == 64)
                need_vtmp = true;
             if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
                need_vtmp = true;
             if (program->gfx_level <= GFX7)
                need_vtmp = true;

             need_vtmp |= cluster_size == 32;
          }

          if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
             vtmp = program->allocateTmp(vtmp.regClass());
             aco_ptr<Instruction> create{
                create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
             create->definitions[0] = Definition(vtmp);
             if (last_top_level_block_idx == block.index) {
                it = block.instructions.insert(it, std::move(create));
                it++;
             } else {
                assert(last_top_level_block_idx < block.index);
                std::vector<aco_ptr<Instruction>>& instructions =
                   program->blocks[last_top_level_block_idx].instructions;
                auto insert_point =
                   std::find_if(instructions.rbegin(), instructions.rend(),
                                [](const auto& iter) {
                                   return iter->opcode == aco_opcode::p_logical_end;
                                })
                      .base();
                instructions.insert(insert_point, std::move(create));
                vtmp_inserted_at = last_top_level_block_idx;
             }
          }

          if (instr->isReduction()) {
             instr->operands[1] = Operand(reduceTmp);
             if (need_vtmp)
                instr->operands[2] = Operand(vtmp);
          } else {
             assert(instr->opcode == aco_opcode::p_interp_gfx11 ||
                    instr->opcode == aco_opcode::p_bpermute_permlane);
             instr->operands[0] = Operand(reduceTmp);
          }
       }
    }
 }

 }; // namespace aco
	/*
	* Copyright © 2018 Valve Corporation
	* Copyright © 2018 Google
	*
	* SPDX-License-Identifier: MIT
	*/

	#include "aco_builder.h"
	#include "aco_ir.h"

	#include <vector>

	/*
	* Insert p_linear_start instructions right before RA to correctly allocate
	* temporaries for reductions that have to disrespect EXEC by executing in
	* WWM.
	*/

	namespace aco {

	void
	setup_reduce_temp(Program* program)
	{
	unsigned last_top_level_block_idx = 0;
	unsigned maxSize = 0;

	std::vector<bool> hasReductions(program->blocks.size());
	for (Block& block : program->blocks) {
	for (aco_ptr<Instruction>& instr : block.instructions) {
	if (instr->opcode == aco_opcode::p_interp_gfx11 \|\|
	instr->opcode == aco_opcode::p_bpermute_permlane) {
	maxSize = MAX2(maxSize, 1);
	hasReductions[block.index] = true;
	} else if (instr->format == Format::PSEUDO_REDUCTION) {
	maxSize = MAX2(maxSize, instr->operands[0].size());
	hasReductions[block.index] = true;
	}
	}
	}

	if (maxSize == 0)
	return;

	assert(maxSize == 1 \|\| maxSize == 2);
	Temp reduceTmp(0, RegClass(RegType::vgpr, maxSize).as_linear());
	Temp vtmp(0, RegClass(RegType::vgpr, maxSize).as_linear());
	int inserted_at = -1;
	int vtmp_inserted_at = -1;

	for (Block& block : program->blocks) {

	if (block.kind & block_kind_top_level) {
	last_top_level_block_idx = block.index;

	/* TODO: this could be improved in this case:
	* start_linear_vgpr
	* if (...) {
	* use_linear_vgpr
	* }
	* end_linear_vgpr
	* Here, the linear vgpr is used before any phi copies, so this isn't necessary.
	*/
	if (inserted_at >= 0) {
	aco_ptr<Instruction> end{create_instruction(
	aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_inserted_at >= 0 ? 2 : 1, 0)};
	end->operands[0] = Operand(reduceTmp);
	if (vtmp_inserted_at >= 0)
	end->operands[1] = Operand(vtmp);

	/* insert after the phis of the block */
	std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
	while ((it)->opcode == aco_opcode::p_linear_phi \|\| (it)->opcode == aco_opcode::p_phi)
	++it;
	block.instructions.insert(it, std::move(end));
	inserted_at = vtmp_inserted_at = -1;
	}
	}

	if (!hasReductions[block.index])
	continue;

	std::vector<aco_ptr<Instruction>>::iterator it;
	for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
	Instruction* instr = (*it).get();
	if (instr->format != Format::PSEUDO_REDUCTION &&
	instr->opcode != aco_opcode::p_interp_gfx11 &&
	instr->opcode != aco_opcode::p_bpermute_permlane)
	continue;

	if ((int)last_top_level_block_idx != inserted_at) {
	reduceTmp = program->allocateTmp(reduceTmp.regClass());
	aco_ptr<Instruction> create{
	create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
	create->definitions[0] = Definition(reduceTmp);
	/* find the right place to insert this definition */
	if (last_top_level_block_idx == block.index) {
	/* insert right before the current instruction */
	it = block.instructions.insert(it, std::move(create));
	it++;
	/* inserted_at is intentionally not updated here, so later blocks
	* would insert at the end instead of using this one. */
	} else {
	assert(last_top_level_block_idx < block.index);
	/* insert after p_logical_end of the last top-level block */
	std::vector<aco_ptr<Instruction>>& instructions =
	program->blocks[last_top_level_block_idx].instructions;
	auto insert_point =
	std::find_if(instructions.rbegin(), instructions.rend(),
	[](const auto& iter) {
	return iter->opcode == aco_opcode::p_logical_end;
	})
	.base();
	instructions.insert(insert_point, std::move(create));
	inserted_at = last_top_level_block_idx;
	}
	}

	/* same as before, except for the vector temporary instead of the reduce temporary */
	bool need_vtmp = false;
	if (instr->isReduction()) {
	ReduceOp op = instr->reduction().reduce_op;
	unsigned cluster_size = instr->reduction().cluster_size;
	need_vtmp = op == imul32 \|\| op == fadd64 \|\| op == fmul64 \|\| op == fmin64 \|\|
	op == fmax64 \|\| op == umin64 \|\| op == umax64 \|\| op == imin64 \|\|
	op == imax64 \|\| op == imul64;
	bool gfx10_need_vtmp = op == imul8 \|\| op == imax8 \|\| op == imin8 \|\| op == umin8 \|\|
	op == imul16 \|\| op == imax16 \|\| op == imin16 \|\| op == umin16 \|\|
	op == iadd64;

	if (program->gfx_level >= GFX10 && cluster_size == 64)
	need_vtmp = true;
	if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
	need_vtmp = true;
	if (program->gfx_level <= GFX7)
	need_vtmp = true;

	need_vtmp \|= cluster_size == 32;
	}

	if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
	vtmp = program->allocateTmp(vtmp.regClass());
	aco_ptr<Instruction> create{
	create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
	create->definitions[0] = Definition(vtmp);
	if (last_top_level_block_idx == block.index) {
	it = block.instructions.insert(it, std::move(create));
	it++;
	} else {
	assert(last_top_level_block_idx < block.index);
	std::vector<aco_ptr<Instruction>>& instructions =
	program->blocks[last_top_level_block_idx].instructions;
	auto insert_point =
	std::find_if(instructions.rbegin(), instructions.rend(),
	[](const auto& iter) {
	return iter->opcode == aco_opcode::p_logical_end;
	})
	.base();
	instructions.insert(insert_point, std::move(create));
	vtmp_inserted_at = last_top_level_block_idx;
	}
	}

	if (instr->isReduction()) {
	instr->operands[1] = Operand(reduceTmp);
	if (need_vtmp)
	instr->operands[2] = Operand(vtmp);
	} else {
	assert(instr->opcode == aco_opcode::p_interp_gfx11 \|\|
	instr->opcode == aco_opcode::p_bpermute_permlane);
	instr->operands[0] = Operand(reduceTmp);
	}
	}
	}
	}

	}; // namespace aco