src/amd/compiler/aco_statistics.cpp - third_party/mesa - Git at Google

 /*
  * Copyright © 2020 Valve Corporation
  *
  * SPDX-License-Identifier: MIT
  */

 #include "aco_ir.h"

 #include "util/crc32.h"

 #include <algorithm>
 #include <deque>
 #include <set>
 #include <vector>

 namespace aco {

 namespace {

 class BlockCycleEstimator {
 public:
    enum resource {
       null = 0,
       scalar,
       branch_sendmsg,
       valu,
       valu_complex,
       lds,
       export_gds,
       vmem,
       resource_count,
    };

    BlockCycleEstimator(Program* program_) : program(program_) {}

    Program* program;

    int32_t cur_cycle = 0;
    int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
    unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};
    int32_t reg_available[512] = {0};
    std::deque<int32_t> mem_ops[wait_type_num];

    void add(aco_ptr<Instruction>& instr);
    void join(const BlockCycleEstimator& other);

 private:
    unsigned get_waitcnt_cost(wait_imm imm);
    unsigned get_dependency_cost(aco_ptr<Instruction>& instr);

    void use_resources(aco_ptr<Instruction>& instr);
    int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);
 };

 struct perf_info {
    int latency;

    BlockCycleEstimator::resource rsrc0;
    unsigned cost0;

    BlockCycleEstimator::resource rsrc1;
    unsigned cost1;
 };

 static bool
 is_dual_issue_capable(const Program& program, const Instruction& instr)
 {
    if (program.gfx_level < GFX11 || !instr.isVALU() || instr.isDPP())
       return false;

    switch (instr.opcode) {
    case aco_opcode::v_fma_f32:
    case aco_opcode::v_fmac_f32:
    case aco_opcode::v_fmaak_f32:
    case aco_opcode::v_fmamk_f32:
    case aco_opcode::v_mul_f32:
    case aco_opcode::v_add_f32:
    case aco_opcode::v_sub_f32:
    case aco_opcode::v_subrev_f32:
    case aco_opcode::v_mul_legacy_f32:
    case aco_opcode::v_fma_legacy_f32:
    case aco_opcode::v_fmac_legacy_f32:
    case aco_opcode::v_fma_f16:
    case aco_opcode::v_fmac_f16:
    case aco_opcode::v_fmaak_f16:
    case aco_opcode::v_fmamk_f16:
    case aco_opcode::v_mul_f16:
    case aco_opcode::v_add_f16:
    case aco_opcode::v_sub_f16:
    case aco_opcode::v_subrev_f16:
    case aco_opcode::v_mov_b32:
    case aco_opcode::v_movreld_b32:
    case aco_opcode::v_movrels_b32:
    case aco_opcode::v_movrelsd_b32:
    case aco_opcode::v_movrelsd_2_b32:
    case aco_opcode::v_cndmask_b32:
    case aco_opcode::v_writelane_b32_e64:
    case aco_opcode::v_mov_b16:
    case aco_opcode::v_cndmask_b16:
    case aco_opcode::v_max_f32:
    case aco_opcode::v_min_f32:
    case aco_opcode::v_max_f16:
    case aco_opcode::v_min_f16:
    case aco_opcode::v_max_i16_e64:
    case aco_opcode::v_min_i16_e64:
    case aco_opcode::v_max_u16_e64:
    case aco_opcode::v_min_u16_e64:
    case aco_opcode::v_add_i16:
    case aco_opcode::v_sub_i16:
    case aco_opcode::v_mad_i16:
    case aco_opcode::v_add_u16_e64:
    case aco_opcode::v_sub_u16_e64:
    case aco_opcode::v_mad_u16:
    case aco_opcode::v_mul_lo_u16_e64:
    case aco_opcode::v_not_b16:
    case aco_opcode::v_and_b16:
    case aco_opcode::v_or_b16:
    case aco_opcode::v_xor_b16:
    case aco_opcode::v_lshrrev_b16_e64:
    case aco_opcode::v_ashrrev_i16_e64:
    case aco_opcode::v_lshlrev_b16_e64:
    case aco_opcode::v_dot2_bf16_bf16:
    case aco_opcode::v_dot2_f32_bf16:
    case aco_opcode::v_dot2_f16_f16:
    case aco_opcode::v_dot2_f32_f16:
    case aco_opcode::v_dot2c_f32_f16: return true;
    case aco_opcode::v_fma_mix_f32:
    case aco_opcode::v_fma_mixlo_f16:
    case aco_opcode::v_fma_mixhi_f16: {
       /* dst and acc type must match */
       if (instr.valu().opsel_hi[2] == (instr.opcode == aco_opcode::v_fma_mix_f32))
          return false;

       /* If all operands are vgprs, two must be the same. */
       for (unsigned i = 0; i < 3; i++) {
          if (instr.operands[i].isConstant() || instr.operands[i].isOfType(RegType::sgpr))
             return true;
          for (unsigned j = 0; j < i; j++) {
             if (instr.operands[i].physReg() == instr.operands[j].physReg())
                return true;
          }
       }
       return false;
    }
    default:
       if (instr.isVINTERP_INREG())
          return program.gfx_level >= GFX11_5;
       if (instr.isVOPC() && instr_info.classes[(int)instr.opcode] == instr_class::valu32)
          return program.gfx_level == GFX11_5;
       return false;
    }
 }

 static perf_info
 get_perf_info(const Program& program, const Instruction& instr)
 {
    instr_class cls = instr_info.classes[(int)instr.opcode];

 #define WAIT(res)          BlockCycleEstimator::res, 0
 #define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt

    if (program.gfx_level >= GFX10) {
       /* fp64 might be incorrect */
       switch (cls) {
       case instr_class::valu32:
       case instr_class::valu_convert32:
       case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
       case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
       case instr_class::valu_quarter_rate32:
          return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
       case instr_class::valu_transcendental32:
          return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
       case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
       case instr_class::valu_double_add:
          return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
       case instr_class::valu_double_convert:
          return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
       case instr_class::valu_double_transcendental:
          return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
       case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
       case instr_class::sfpu: return {4, WAIT_USE(scalar, 1)};
       case instr_class::valu_pseudo_scalar_trans:
          return {7, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 1)};
       case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
       case instr_class::branch:
       case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 3)};
       case instr_class::ds:
          return instr.isDS() && instr.ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
                                                : perf_info{0, WAIT_USE(lds, 1)};
       case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
       case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
       case instr_class::wmma: {
          /* int8 and (b)f16 have the same performance. */
          uint8_t cost = instr.opcode == aco_opcode::v_wmma_i32_16x16x16_iu4 ? 16 : 32;
          return {cost, WAIT_USE(valu, cost)};
       }
       case instr_class::barrier:
       case instr_class::waitcnt:
       case instr_class::other:
       default: return {0};
       }
    } else {
       switch (cls) {
       case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
       case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
       case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
       case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
       case instr_class::valu_fma:
          return program.dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
                                            : perf_info{16, WAIT_USE(valu, 16)};
       case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
       case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
       case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
       case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
       case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
       case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
       case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
       case instr_class::branch: return {4, WAIT_USE(branch_sendmsg, 4)};
       case instr_class::ds:
          return instr.isDS() && instr.ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
                                                : perf_info{4, WAIT_USE(lds, 4)};
       case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
       case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
       case instr_class::barrier:
       case instr_class::waitcnt:
       case instr_class::other:
       default: return {4};
       }
    }

 #undef WAIT_USE
 #undef WAIT
 }

 void
 BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
 {
    perf_info perf = get_perf_info(*program, *instr);

    if (perf.rsrc0 != resource_count) {
       res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
       res_usage[(int)perf.rsrc0] += perf.cost0;
    }

    if (perf.rsrc1 != resource_count) {
       res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;
       res_usage[(int)perf.rsrc1] += perf.cost1;
    }
 }

 int32_t
 BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
 {
    perf_info perf = get_perf_info(*program, *instr);

    int32_t cost = 0;
    if (perf.rsrc0 != resource_count)
       cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);
    if (perf.rsrc1 != resource_count)
       cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);

    return cost;
 }

 static std::array<unsigned, wait_type_num>
 get_wait_counter_info(Program* program, aco_ptr<Instruction>& instr)
 {
    /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
     * depends a lot on the situation. */

    std::array<unsigned, wait_type_num> info{};

    if (instr->isEXP()) {
       info[wait_type_exp] = 16;
    } else if (instr->isLDSDIR()) {
       info[wait_type_exp] = 13;
    } else if (instr->isFlatLike()) {
       info[wait_type_lgkm] = instr->isFlat() ? 20 : 0;
       if (!instr->definitions.empty() || program->gfx_level < GFX10)
          info[wait_type_vm] = 320;
       else
          info[wait_type_vs] = 320;
    } else if (instr->isSMEM()) {
       wait_type type = program->gfx_level >= GFX12 ? wait_type_km : wait_type_lgkm;
       if (instr->definitions.empty()) {
          info[type] = 200;
       } else if (instr->operands.empty()) { /* s_memtime and s_memrealtime */
          info[type] = 1;
       } else {
          bool likely_desc_load = instr->operands[0].size() == 2;
          bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
          bool const_offset =
             instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());

          if (likely_desc_load || const_offset)
             info[type] = 30; /* likely to hit L0 cache */
          else
             info[type] = 200;
       }
    } else if (instr->isDS()) {
       info[wait_type_lgkm] = 20;
    } else if (instr->isVMEM() && instr->definitions.empty() && program->gfx_level >= GFX10) {
       info[wait_type_vs] = 320;
    } else if (instr->isVMEM()) {
       uint8_t vm_type = get_vmem_type(program->gfx_level, program->family, instr.get());
       wait_type type = wait_type_vm;
       if (program->gfx_level >= GFX12 && vm_type == vmem_bvh)
          type = wait_type_bvh;
       else if (program->gfx_level >= GFX12 && vm_type == vmem_sampler)
          type = wait_type_sample;
       info[type] = 320;
    }

    return info;
 }

 static wait_imm
 get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
 {
    wait_imm imm;
    if (instr->opcode == aco_opcode::s_endpgm) {
       for (unsigned i = 0; i < wait_type_num; i++)
          imm[i] = 0;
    } else if (imm.unpack(program->gfx_level, instr.get())) {
    } else if (instr->isVINTERP_INREG()) {
       imm.exp = instr->vinterp_inreg().wait_exp;
       if (imm.exp == 0x7)
          imm.exp = wait_imm::unset_counter;
    } else {
       /* If an instruction increases a counter, it waits for it to be below maximum first. */
       std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program, instr);
       wait_imm max = wait_imm::max(program->gfx_level);
       for (unsigned i = 0; i < wait_type_num; i++) {
          if (wait_info[i])
             imm[i] = max[i] - 1;
       }
    }
    return imm;
 }

 unsigned
 BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
 {
    int deps_available = cur_cycle;

    wait_imm imm = get_wait_imm(program, instr);
    for (unsigned i = 0; i < wait_type_num; i++) {
       if (imm[i] == wait_imm::unset_counter)
          continue;
       for (int j = 0; j < (int)mem_ops[i].size() - imm[i]; j++)
          deps_available = MAX2(deps_available, mem_ops[i][j]);
    }

    if (instr->opcode == aco_opcode::s_endpgm) {
       for (unsigned i = 0; i < 512; i++)
          deps_available = MAX2(deps_available, reg_available[i]);
    } else if (program->gfx_level >= GFX10) {
       for (Operand& op : instr->operands) {
          if (op.isConstant() || op.isUndefined())
             continue;
          for (unsigned i = 0; i < op.size(); i++)
             deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);
       }
    }

    if (program->gfx_level < GFX10)
       deps_available = align(deps_available, 4);

    return deps_available - cur_cycle;
 }

 static bool
 is_vector(aco_opcode op)
 {
    switch (instr_info.classes[(int)op]) {
    case instr_class::valu32:
    case instr_class::valu_convert32:
    case instr_class::valu_fma:
    case instr_class::valu_double:
    case instr_class::valu_double_add:
    case instr_class::valu_double_convert:
    case instr_class::valu_double_transcendental:
    case instr_class::vmem:
    case instr_class::ds:
    case instr_class::exp:
    case instr_class::valu64:
    case instr_class::valu_quarter_rate32:
    case instr_class::valu_transcendental32: return true;
    default: return false;
    }
 }

 void
 BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
 {
    perf_info perf = get_perf_info(*program, *instr);

    cur_cycle += get_dependency_cost(instr);

    unsigned start;
    bool dual_issue = program->gfx_level >= GFX10 && program->wave_size == 64 &&
                      is_vector(instr->opcode) && !is_dual_issue_capable(*program, *instr) &&
                      program->workgroup_size > 32;
    for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
       cur_cycle += cycles_until_res_available(instr);

       start = cur_cycle;
       use_resources(instr);

       /* GCN is in-order and doesn't begin the next instruction until the current one finishes */
       cur_cycle += program->gfx_level >= GFX10 ? 1 : perf.latency;
    }

    wait_imm imm = get_wait_imm(program, instr);
    for (unsigned i = 0; i < wait_type_num; i++) {
       while (mem_ops[i].size() > imm[i])
          mem_ops[i].pop_front();
    }

    std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program, instr);
    for (unsigned i = 0; i < wait_type_num; i++) {
       if (wait_info[i])
          mem_ops[i].push_back(cur_cycle + wait_info[i]);
    }

    /* This is inaccurate but shouldn't affect anything after waitcnt insertion.
     * Before waitcnt insertion, this is necessary to consider memory operations.
     */
    unsigned latency = 0;
    for (unsigned i = 0; i < wait_type_num; i++)
       latency = MAX2(latency, i == wait_type_vs ? 0 : wait_info[i]);
    int32_t result_available = start + MAX2(perf.latency, (int32_t)latency);

    for (Definition& def : instr->definitions) {
       int32_t* available = &reg_available[def.physReg().reg()];
       for (unsigned i = 0; i < def.size(); i++)
          available[i] = MAX2(available[i], result_available);
    }
 }

 void
 BlockCycleEstimator::join(const BlockCycleEstimator& pred)
 {
    assert(cur_cycle == 0);

    for (unsigned i = 0; i < (unsigned)resource_count; i++) {
       assert(res_usage[i] == 0);
       res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);
    }

    for (unsigned i = 0; i < 512; i++)
       reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);

    for (unsigned i = 0; i < wait_type_num; i++) {
       std::deque<int32_t>& ops = mem_ops[i];
       const std::deque<int32_t>& pred_ops = pred.mem_ops[i];
       for (unsigned j = 0; j < MIN2(ops.size(), pred_ops.size()); j++)
          ops.rbegin()[j] = MAX2(ops.rbegin()[j], pred_ops.rbegin()[j] - pred.cur_cycle);
       for (int j = pred_ops.size() - ops.size() - 1; j >= 0; j--)
          ops.push_front(pred_ops[j] - pred.cur_cycle);
    }
 }

 } /* end namespace */

 /* sgpr_presched/vgpr_presched */
 void
 collect_presched_stats(Program* program)
 {
    RegisterDemand presched_demand;
    for (Block& block : program->blocks)
       presched_demand.update(block.register_demand);
    program->statistics.presgprs = presched_demand.sgpr;
    program->statistics.prevgprs = presched_demand.vgpr;
 }

 /* instructions/branches/vmem_clauses/smem_clauses/cycles */
 void
 collect_preasm_stats(Program* program)
 {
    for (Block& block : program->blocks) {
       std::set<Instruction*> vmem_clause;
       std::set<Instruction*> smem_clause;

       program->statistics.instrs += block.instructions.size();

       for (aco_ptr<Instruction>& instr : block.instructions) {
          const bool is_branch =
             instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch;
          if (is_branch)
             program->statistics.branches++;

          if (instr->isVALU() || instr->isVINTRP())
             program->statistics.valu++;
          if (instr->isSALU() && !instr->isSOPP() &&
              instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
             program->statistics.salu++;
          if (instr->isVOPD())
             program->statistics.vopd++;

          if ((instr->isVMEM() || instr->isFlatLike()) && !instr->operands.empty()) {
             if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
                              [&](Instruction* other)
                              { return should_form_clause(instr.get(), other); }))
                program->statistics.vclause++;
             vmem_clause.insert(instr.get());

             program->statistics.vmem++;
          } else {
             vmem_clause.clear();
          }

          if (instr->isSMEM() && !instr->operands.empty()) {
             if (std::none_of(smem_clause.begin(), smem_clause.end(),
                              [&](Instruction* other)
                              { return should_form_clause(instr.get(), other); }))
                program->statistics.sclause++;
             smem_clause.insert(instr.get());

             program->statistics.smem++;
          } else {
             smem_clause.clear();
          }
       }
    }

    double latency = 0;
    double usage[(int)BlockCycleEstimator::resource_count] = {0};
    std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);

    constexpr const unsigned vmem_latency = 320;
    for (const Definition def : program->args_pending_vmem) {
       blocks[0].mem_ops[wait_type_vm].push_back(vmem_latency);
       for (unsigned i = 0; i < def.size(); i++)
          blocks[0].reg_available[def.physReg().reg() + i] = vmem_latency;
    }

    for (Block& block : program->blocks) {
       BlockCycleEstimator& block_est = blocks[block.index];
       for (unsigned pred : block.linear_preds)
          block_est.join(blocks[pred]);

       for (aco_ptr<Instruction>& instr : block.instructions) {
          unsigned before = block_est.cur_cycle;
          block_est.add(instr);
          instr->pass_flags = block_est.cur_cycle - before;
       }

       /* TODO: it would be nice to be able to consider estimated loop trip
        * counts used for loop unrolling.
        */

       /* TODO: estimate the trip_count of divergent loops (those which break
        * divergent) higher than of uniform loops
        */

       /* Assume loops execute 8-2 times, uniform branches are taken 50% the time,
        * and any lane in the wave takes a side of a divergent branch 75% of the
        * time.
        */
       double iter = 1.0;
       iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;
       iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;
       iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;
       iter *= pow(0.5, block.uniform_if_depth);
       iter *= pow(0.75, block.divergent_if_logical_depth);

       bool divergent_if_linear_else =
          block.logical_preds.empty() && block.linear_preds.size() == 1 &&
          block.linear_succs.size() == 1 &&
          program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
       if (divergent_if_linear_else)
          iter *= 0.25;

       latency += block_est.cur_cycle * iter;
       for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)
          usage[i] += block_est.res_usage[i] * iter;
    }

    /* This likely exaggerates the effectiveness of parallelism because it
     * ignores instruction ordering. It can assume there might be SALU/VALU/etc
     * work to from other waves while one is idle but that might not be the case
     * because those other waves have not reached such a point yet.
     */

    double parallelism = program->num_waves;
    for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {
       if (usage[i] > 0.0)
          parallelism = MIN2(parallelism, latency / usage[i]);
    }
    double waves_per_cycle = 1.0 / latency * parallelism;
    double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);

    double max_utilization = 1.0;
    if (program->workgroup_size != UINT_MAX)
       max_utilization =
          program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
    wave64_per_cycle *= max_utilization;

    program->statistics.latency = round(latency);
    program->statistics.invthroughput = round(1.0 / wave64_per_cycle);

    if (debug_flags & DEBUG_PERF_INFO) {
       aco_print_program(program, stderr, print_no_ssa | print_perf_info);

       fprintf(stderr, "num_waves: %u\n", program->num_waves);
       fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
       fprintf(stderr, "branch_sendmsg_usage: %f\n",
               usage[(int)BlockCycleEstimator::branch_sendmsg]);
       fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
       fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
       fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
       fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
       fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
       fprintf(stderr, "latency: %f\n", latency);
       fprintf(stderr, "parallelism: %f\n", parallelism);
       fprintf(stderr, "max_utilization: %f\n", max_utilization);
       fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
       fprintf(stderr, "\n");
    }
 }

 void
 collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
 {
    program->statistics.hash = util_hash_crc32(code.data(), code.size() * 4);
 }

 Instruction_cycle_info
 get_cycle_info(const Program& program, const Instruction& instr)
 {
    perf_info info = get_perf_info(program, instr);
    return Instruction_cycle_info{(unsigned)info.latency, std::max(info.cost0, info.cost1)};
 }

 } // namespace aco
	/*
	* Copyright © 2020 Valve Corporation
	*
	* SPDX-License-Identifier: MIT
	*/

	#include "aco_ir.h"

	#include "util/crc32.h"

	#include <algorithm>
	#include <deque>
	#include <set>
	#include <vector>

	namespace aco {

	namespace {

	class BlockCycleEstimator {
	public:
	enum resource {
	null = 0,
	scalar,
	branch_sendmsg,
	valu,
	valu_complex,
	lds,
	export_gds,
	vmem,
	resource_count,
	};

	BlockCycleEstimator(Program* program_) : program(program_) {}

	Program* program;

	int32_t cur_cycle = 0;
	int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
	unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};
	int32_t reg_available[512] = {0};
	std::deque<int32_t> mem_ops[wait_type_num];

	void add(aco_ptr<Instruction>& instr);
	void join(const BlockCycleEstimator& other);

	private:
	unsigned get_waitcnt_cost(wait_imm imm);
	unsigned get_dependency_cost(aco_ptr<Instruction>& instr);

	void use_resources(aco_ptr<Instruction>& instr);
	int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);
	};

	struct perf_info {
	int latency;

	BlockCycleEstimator::resource rsrc0;
	unsigned cost0;

	BlockCycleEstimator::resource rsrc1;
	unsigned cost1;
	};

	static bool
	is_dual_issue_capable(const Program& program, const Instruction& instr)
	{
	if (program.gfx_level < GFX11 \|\| !instr.isVALU() \|\| instr.isDPP())
	return false;

	switch (instr.opcode) {
	case aco_opcode::v_fma_f32:
	case aco_opcode::v_fmac_f32:
	case aco_opcode::v_fmaak_f32:
	case aco_opcode::v_fmamk_f32:
	case aco_opcode::v_mul_f32:
	case aco_opcode::v_add_f32:
	case aco_opcode::v_sub_f32:
	case aco_opcode::v_subrev_f32:
	case aco_opcode::v_mul_legacy_f32:
	case aco_opcode::v_fma_legacy_f32:
	case aco_opcode::v_fmac_legacy_f32:
	case aco_opcode::v_fma_f16:
	case aco_opcode::v_fmac_f16:
	case aco_opcode::v_fmaak_f16:
	case aco_opcode::v_fmamk_f16:
	case aco_opcode::v_mul_f16:
	case aco_opcode::v_add_f16:
	case aco_opcode::v_sub_f16:
	case aco_opcode::v_subrev_f16:
	case aco_opcode::v_mov_b32:
	case aco_opcode::v_movreld_b32:
	case aco_opcode::v_movrels_b32:
	case aco_opcode::v_movrelsd_b32:
	case aco_opcode::v_movrelsd_2_b32:
	case aco_opcode::v_cndmask_b32:
	case aco_opcode::v_writelane_b32_e64:
	case aco_opcode::v_mov_b16:
	case aco_opcode::v_cndmask_b16:
	case aco_opcode::v_max_f32:
	case aco_opcode::v_min_f32:
	case aco_opcode::v_max_f16:
	case aco_opcode::v_min_f16:
	case aco_opcode::v_max_i16_e64:
	case aco_opcode::v_min_i16_e64:
	case aco_opcode::v_max_u16_e64:
	case aco_opcode::v_min_u16_e64:
	case aco_opcode::v_add_i16:
	case aco_opcode::v_sub_i16:
	case aco_opcode::v_mad_i16:
	case aco_opcode::v_add_u16_e64:
	case aco_opcode::v_sub_u16_e64:
	case aco_opcode::v_mad_u16:
	case aco_opcode::v_mul_lo_u16_e64:
	case aco_opcode::v_not_b16:
	case aco_opcode::v_and_b16:
	case aco_opcode::v_or_b16:
	case aco_opcode::v_xor_b16:
	case aco_opcode::v_lshrrev_b16_e64:
	case aco_opcode::v_ashrrev_i16_e64:
	case aco_opcode::v_lshlrev_b16_e64:
	case aco_opcode::v_dot2_bf16_bf16:
	case aco_opcode::v_dot2_f32_bf16:
	case aco_opcode::v_dot2_f16_f16:
	case aco_opcode::v_dot2_f32_f16:
	case aco_opcode::v_dot2c_f32_f16: return true;
	case aco_opcode::v_fma_mix_f32:
	case aco_opcode::v_fma_mixlo_f16:
	case aco_opcode::v_fma_mixhi_f16: {
	/* dst and acc type must match */
	if (instr.valu().opsel_hi[2] == (instr.opcode == aco_opcode::v_fma_mix_f32))
	return false;

	/* If all operands are vgprs, two must be the same. */
	for (unsigned i = 0; i < 3; i++) {
	if (instr.operands[i].isConstant() \|\| instr.operands[i].isOfType(RegType::sgpr))
	return true;
	for (unsigned j = 0; j < i; j++) {
	if (instr.operands[i].physReg() == instr.operands[j].physReg())
	return true;
	}
	}
	return false;
	}
	default:
	if (instr.isVINTERP_INREG())
	return program.gfx_level >= GFX11_5;
	if (instr.isVOPC() && instr_info.classes[(int)instr.opcode] == instr_class::valu32)
	return program.gfx_level == GFX11_5;
	return false;
	}
	}

	static perf_info
	get_perf_info(const Program& program, const Instruction& instr)
	{
	instr_class cls = instr_info.classes[(int)instr.opcode];

	#define WAIT(res) BlockCycleEstimator::res, 0
	#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt

	if (program.gfx_level >= GFX10) {
	/* fp64 might be incorrect */
	switch (cls) {
	case instr_class::valu32:
	case instr_class::valu_convert32:
	case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
	case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
	case instr_class::valu_quarter_rate32:
	return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
	case instr_class::valu_transcendental32:
	return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
	case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
	case instr_class::valu_double_add:
	return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
	case instr_class::valu_double_convert:
	return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
	case instr_class::valu_double_transcendental:
	return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
	case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
	case instr_class::sfpu: return {4, WAIT_USE(scalar, 1)};
	case instr_class::valu_pseudo_scalar_trans:
	return {7, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 1)};
	case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
	case instr_class::branch:
	case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 3)};
	case instr_class::ds:
	return instr.isDS() && instr.ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
	: perf_info{0, WAIT_USE(lds, 1)};
	case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
	case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
	case instr_class::wmma: {
	/* int8 and (b)f16 have the same performance. */
	uint8_t cost = instr.opcode == aco_opcode::v_wmma_i32_16x16x16_iu4 ? 16 : 32;
	return {cost, WAIT_USE(valu, cost)};
	}
	case instr_class::barrier:
	case instr_class::waitcnt:
	case instr_class::other:
	default: return {0};
	}
	} else {
	switch (cls) {
	case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
	case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
	case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
	case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
	case instr_class::valu_fma:
	return program.dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
	: perf_info{16, WAIT_USE(valu, 16)};
	case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
	case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
	case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
	case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
	case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
	case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
	case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
	case instr_class::branch: return {4, WAIT_USE(branch_sendmsg, 4)};
	case instr_class::ds:
	return instr.isDS() && instr.ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
	: perf_info{4, WAIT_USE(lds, 4)};
	case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
	case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
	case instr_class::barrier:
	case instr_class::waitcnt:
	case instr_class::other:
	default: return {4};
	}
	}

	#undef WAIT_USE
	#undef WAIT
	}

	void
	BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
	{
	perf_info perf = get_perf_info(program, instr);

	if (perf.rsrc0 != resource_count) {
	res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
	res_usage[(int)perf.rsrc0] += perf.cost0;
	}

	if (perf.rsrc1 != resource_count) {
	res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;
	res_usage[(int)perf.rsrc1] += perf.cost1;
	}
	}

	int32_t
	BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
	{
	perf_info perf = get_perf_info(program, instr);

	int32_t cost = 0;
	if (perf.rsrc0 != resource_count)
	cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);
	if (perf.rsrc1 != resource_count)
	cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);

	return cost;
	}

	static std::array<unsigned, wait_type_num>
	get_wait_counter_info(Program* program, aco_ptr<Instruction>& instr)
	{
	/* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
	* depends a lot on the situation. */

	std::array<unsigned, wait_type_num> info{};

	if (instr->isEXP()) {
	info[wait_type_exp] = 16;
	} else if (instr->isLDSDIR()) {
	info[wait_type_exp] = 13;
	} else if (instr->isFlatLike()) {
	info[wait_type_lgkm] = instr->isFlat() ? 20 : 0;
	if (!instr->definitions.empty() \|\| program->gfx_level < GFX10)
	info[wait_type_vm] = 320;
	else
	info[wait_type_vs] = 320;
	} else if (instr->isSMEM()) {
	wait_type type = program->gfx_level >= GFX12 ? wait_type_km : wait_type_lgkm;
	if (instr->definitions.empty()) {
	info[type] = 200;
	} else if (instr->operands.empty()) { /* s_memtime and s_memrealtime */
	info[type] = 1;
	} else {
	bool likely_desc_load = instr->operands[0].size() == 2;
	bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
	bool const_offset =
	instr->operands[1].isConstant() && (!soe \|\| instr->operands.back().isConstant());

	if (likely_desc_load \|\| const_offset)
	info[type] = 30; /* likely to hit L0 cache */
	else
	info[type] = 200;
	}
	} else if (instr->isDS()) {
	info[wait_type_lgkm] = 20;
	} else if (instr->isVMEM() && instr->definitions.empty() && program->gfx_level >= GFX10) {
	info[wait_type_vs] = 320;
	} else if (instr->isVMEM()) {
	uint8_t vm_type = get_vmem_type(program->gfx_level, program->family, instr.get());
	wait_type type = wait_type_vm;
	if (program->gfx_level >= GFX12 && vm_type == vmem_bvh)
	type = wait_type_bvh;
	else if (program->gfx_level >= GFX12 && vm_type == vmem_sampler)
	type = wait_type_sample;
	info[type] = 320;
	}

	return info;
	}

	static wait_imm
	get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
	{
	wait_imm imm;
	if (instr->opcode == aco_opcode::s_endpgm) {
	for (unsigned i = 0; i < wait_type_num; i++)
	imm[i] = 0;
	} else if (imm.unpack(program->gfx_level, instr.get())) {
	} else if (instr->isVINTERP_INREG()) {
	imm.exp = instr->vinterp_inreg().wait_exp;
	if (imm.exp == 0x7)
	imm.exp = wait_imm::unset_counter;
	} else {
	/* If an instruction increases a counter, it waits for it to be below maximum first. */
	std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program, instr);
	wait_imm max = wait_imm::max(program->gfx_level);
	for (unsigned i = 0; i < wait_type_num; i++) {
	if (wait_info[i])
	imm[i] = max[i] - 1;
	}
	}
	return imm;
	}

	unsigned
	BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
	{
	int deps_available = cur_cycle;

	wait_imm imm = get_wait_imm(program, instr);
	for (unsigned i = 0; i < wait_type_num; i++) {
	if (imm[i] == wait_imm::unset_counter)
	continue;
	for (int j = 0; j < (int)mem_ops[i].size() - imm[i]; j++)
	deps_available = MAX2(deps_available, mem_ops[i][j]);
	}

	if (instr->opcode == aco_opcode::s_endpgm) {
	for (unsigned i = 0; i < 512; i++)
	deps_available = MAX2(deps_available, reg_available[i]);
	} else if (program->gfx_level >= GFX10) {
	for (Operand& op : instr->operands) {
	if (op.isConstant() \|\| op.isUndefined())
	continue;
	for (unsigned i = 0; i < op.size(); i++)
	deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);
	}
	}

	if (program->gfx_level < GFX10)
	deps_available = align(deps_available, 4);

	return deps_available - cur_cycle;
	}

	static bool
	is_vector(aco_opcode op)
	{
	switch (instr_info.classes[(int)op]) {
	case instr_class::valu32:
	case instr_class::valu_convert32:
	case instr_class::valu_fma:
	case instr_class::valu_double:
	case instr_class::valu_double_add:
	case instr_class::valu_double_convert:
	case instr_class::valu_double_transcendental:
	case instr_class::vmem:
	case instr_class::ds:
	case instr_class::exp:
	case instr_class::valu64:
	case instr_class::valu_quarter_rate32:
	case instr_class::valu_transcendental32: return true;
	default: return false;
	}
	}

	void
	BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
	{
	perf_info perf = get_perf_info(program, instr);

	cur_cycle += get_dependency_cost(instr);

	unsigned start;
	bool dual_issue = program->gfx_level >= GFX10 && program->wave_size == 64 &&
	is_vector(instr->opcode) && !is_dual_issue_capable(program, instr) &&
	program->workgroup_size > 32;
	for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
	cur_cycle += cycles_until_res_available(instr);

	start = cur_cycle;
	use_resources(instr);

	/* GCN is in-order and doesn't begin the next instruction until the current one finishes */
	cur_cycle += program->gfx_level >= GFX10 ? 1 : perf.latency;
	}

	wait_imm imm = get_wait_imm(program, instr);
	for (unsigned i = 0; i < wait_type_num; i++) {
	while (mem_ops[i].size() > imm[i])
	mem_ops[i].pop_front();
	}

	std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program, instr);
	for (unsigned i = 0; i < wait_type_num; i++) {
	if (wait_info[i])
	mem_ops[i].push_back(cur_cycle + wait_info[i]);
	}

	/* This is inaccurate but shouldn't affect anything after waitcnt insertion.
	* Before waitcnt insertion, this is necessary to consider memory operations.
	*/
	unsigned latency = 0;
	for (unsigned i = 0; i < wait_type_num; i++)
	latency = MAX2(latency, i == wait_type_vs ? 0 : wait_info[i]);
	int32_t result_available = start + MAX2(perf.latency, (int32_t)latency);

	for (Definition& def : instr->definitions) {
	int32_t* available = &reg_available[def.physReg().reg()];
	for (unsigned i = 0; i < def.size(); i++)
	available[i] = MAX2(available[i], result_available);
	}
	}

	void
	BlockCycleEstimator::join(const BlockCycleEstimator& pred)
	{
	assert(cur_cycle == 0);

	for (unsigned i = 0; i < (unsigned)resource_count; i++) {
	assert(res_usage[i] == 0);
	res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);
	}

	for (unsigned i = 0; i < 512; i++)
	reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);

	for (unsigned i = 0; i < wait_type_num; i++) {
	std::deque<int32_t>& ops = mem_ops[i];
	const std::deque<int32_t>& pred_ops = pred.mem_ops[i];
	for (unsigned j = 0; j < MIN2(ops.size(), pred_ops.size()); j++)
	ops.rbegin()[j] = MAX2(ops.rbegin()[j], pred_ops.rbegin()[j] - pred.cur_cycle);
	for (int j = pred_ops.size() - ops.size() - 1; j >= 0; j--)
	ops.push_front(pred_ops[j] - pred.cur_cycle);
	}
	}

	} /* end namespace */

	/* sgpr_presched/vgpr_presched */
	void
	collect_presched_stats(Program* program)
	{
	RegisterDemand presched_demand;
	for (Block& block : program->blocks)
	presched_demand.update(block.register_demand);
	program->statistics.presgprs = presched_demand.sgpr;
	program->statistics.prevgprs = presched_demand.vgpr;
	}

	/* instructions/branches/vmem_clauses/smem_clauses/cycles */
	void
	collect_preasm_stats(Program* program)
	{
	for (Block& block : program->blocks) {
	std::set<Instruction*> vmem_clause;
	std::set<Instruction*> smem_clause;

	program->statistics.instrs += block.instructions.size();

	for (aco_ptr<Instruction>& instr : block.instructions) {
	const bool is_branch =
	instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch;
	if (is_branch)
	program->statistics.branches++;

	if (instr->isVALU() \|\| instr->isVINTRP())
	program->statistics.valu++;
	if (instr->isSALU() && !instr->isSOPP() &&
	instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
	program->statistics.salu++;
	if (instr->isVOPD())
	program->statistics.vopd++;

	if ((instr->isVMEM() \|\| instr->isFlatLike()) && !instr->operands.empty()) {
	if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
	[&](Instruction* other)
	{ return should_form_clause(instr.get(), other); }))
	program->statistics.vclause++;
	vmem_clause.insert(instr.get());

	program->statistics.vmem++;
	} else {
	vmem_clause.clear();
	}

	if (instr->isSMEM() && !instr->operands.empty()) {
	if (std::none_of(smem_clause.begin(), smem_clause.end(),
	[&](Instruction* other)
	{ return should_form_clause(instr.get(), other); }))
	program->statistics.sclause++;
	smem_clause.insert(instr.get());

	program->statistics.smem++;
	} else {
	smem_clause.clear();
	}
	}
	}

	double latency = 0;
	double usage[(int)BlockCycleEstimator::resource_count] = {0};
	std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);

	constexpr const unsigned vmem_latency = 320;
	for (const Definition def : program->args_pending_vmem) {
	blocks[0].mem_ops[wait_type_vm].push_back(vmem_latency);
	for (unsigned i = 0; i < def.size(); i++)
	blocks[0].reg_available[def.physReg().reg() + i] = vmem_latency;
	}

	for (Block& block : program->blocks) {
	BlockCycleEstimator& block_est = blocks[block.index];
	for (unsigned pred : block.linear_preds)
	block_est.join(blocks[pred]);

	for (aco_ptr<Instruction>& instr : block.instructions) {
	unsigned before = block_est.cur_cycle;
	block_est.add(instr);
	instr->pass_flags = block_est.cur_cycle - before;
	}

	/* TODO: it would be nice to be able to consider estimated loop trip
	* counts used for loop unrolling.
	*/

	/* TODO: estimate the trip_count of divergent loops (those which break
	* divergent) higher than of uniform loops
	*/

	/* Assume loops execute 8-2 times, uniform branches are taken 50% the time,
	* and any lane in the wave takes a side of a divergent branch 75% of the
	* time.
	*/
	double iter = 1.0;
	iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;
	iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;
	iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;
	iter *= pow(0.5, block.uniform_if_depth);
	iter *= pow(0.75, block.divergent_if_logical_depth);

	bool divergent_if_linear_else =
	block.logical_preds.empty() && block.linear_preds.size() == 1 &&
	block.linear_succs.size() == 1 &&
	program->blocks[block.linear_preds[0]].kind & (block_kind_branch \| block_kind_invert);
	if (divergent_if_linear_else)
	iter *= 0.25;

	latency += block_est.cur_cycle * iter;
	for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)
	usage[i] += block_est.res_usage[i] * iter;
	}

	/* This likely exaggerates the effectiveness of parallelism because it
	* ignores instruction ordering. It can assume there might be SALU/VALU/etc
	* work to from other waves while one is idle but that might not be the case
	* because those other waves have not reached such a point yet.
	*/

	double parallelism = program->num_waves;
	for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {
	if (usage[i] > 0.0)
	parallelism = MIN2(parallelism, latency / usage[i]);
	}
	double waves_per_cycle = 1.0 / latency * parallelism;
	double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);

	double max_utilization = 1.0;
	if (program->workgroup_size != UINT_MAX)
	max_utilization =
	program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
	wave64_per_cycle *= max_utilization;

	program->statistics.latency = round(latency);
	program->statistics.invthroughput = round(1.0 / wave64_per_cycle);

	if (debug_flags & DEBUG_PERF_INFO) {
	aco_print_program(program, stderr, print_no_ssa \| print_perf_info);

	fprintf(stderr, "num_waves: %u\n", program->num_waves);
	fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
	fprintf(stderr, "branch_sendmsg_usage: %f\n",
	usage[(int)BlockCycleEstimator::branch_sendmsg]);
	fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
	fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
	fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
	fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
	fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
	fprintf(stderr, "latency: %f\n", latency);
	fprintf(stderr, "parallelism: %f\n", parallelism);
	fprintf(stderr, "max_utilization: %f\n", max_utilization);
	fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
	fprintf(stderr, "\n");
	}
	}

	void
	collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
	{
	program->statistics.hash = util_hash_crc32(code.data(), code.size() * 4);
	}

	Instruction_cycle_info
	get_cycle_info(const Program& program, const Instruction& instr)
	{
	perf_info info = get_perf_info(program, instr);
	return Instruction_cycle_info{(unsigned)info.latency, std::max(info.cost0, info.cost1)};
	}

	} // namespace aco