| /* |
| * Copyright © 2020 Valve Corporation |
| * |
| * SPDX-License-Identifier: MIT |
| */ |
| #include "common/amdgfxregs.h" |
| |
| #include "helpers.h" |
| |
| using namespace aco; |
| |
| /* |
| * Emit a buffer_load_dword through the file-wide builder `bld`. |
| * |
| * offset: immediate passed as the instruction offset (shown as "offset:N" in |
| * the expected patterns of this file when non-zero). |
| * dst: register receiving the loaded dword (v1); PhysReg(256) is printed as |
| * v[0] in the expected patterns. |
| * vaddr: VGPR operand (v1) passed after the s[0-3] operand. |
| * |
| * The trailing `true` argument shows up as "offen" in the expected patterns. |
| */ |
| void |
| create_mubuf(unsigned offset, PhysReg dst = PhysReg(256), PhysReg vaddr = PhysReg(256)) |
| { |
| Definition loaded(dst, v1); |
| Operand sgpr_quad(PhysReg(0), s4); |
| Operand vgpr_addr(vaddr, v1); |
| Operand zero = Operand::zero(); |
| |
| bld.mubuf(aco_opcode::buffer_load_dword, loaded, sgpr_quad, vgpr_addr, zero, offset, true); |
| } |
| |
| /* |
| * Emit a buffer_store_dword through the file-wide builder `bld`. |
| * |
| * src: VGPR (v1) used both as the second operand and as the last operand of |
| * the store; PhysReg(256) is printed as v[0] in the expected patterns. |
| * |
| * The immediate offset is 0 and the trailing `true` argument shows up as |
| * "offen" in the expected patterns of this file. |
| */ |
| void |
| create_mubuf_store(PhysReg src = PhysReg(256)) |
| { |
| Operand sgpr_quad(PhysReg(0), s4); |
| Operand vgpr(src, v1); /* passed twice below, exactly like the original call */ |
| Operand zero = Operand::zero(); |
| |
| bld.mubuf(aco_opcode::buffer_store_dword, sgpr_quad, vgpr, zero, vgpr, 0, true); |
| } |
| |
| /* |
| * Build an image_sample MIMG instruction with `addrs` one-dword VGPR address |
| * operands and insert it through the file-wide builder `bld`. |
| * |
| * nsa: when true the address operands use every second register starting at |
| * PhysReg(256) (printed as v[0], v[2], ... in the expected patterns); |
| * when false they use consecutive registers. |
| * addrs: number of address operands appended after the three fixed operands. |
| * instr_dwords: value that get_mimg_nsa_dwords() + 2 is asserted to equal. |
| */ |
| void |
| create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords) |
| { |
| const unsigned fixed_ops = 3; |
| const unsigned reg_step = nsa ? 2 : 1; |
| |
| aco_ptr<Instruction> sample{ |
| create_instruction(aco_opcode::image_sample, Format::MIMG, fixed_ops + addrs, 1)}; |
| sample->definitions[0] = Definition(PhysReg(256), v1); |
| sample->operands[0] = Operand(PhysReg(0), s8); |
| sample->operands[1] = Operand(PhysReg(0), s4); |
| sample->operands[2] = Operand(v1); |
| for (unsigned idx = 0; idx != addrs; ++idx) |
| sample->operands[fixed_ops + idx] = Operand(PhysReg(256 + idx * reg_step), v1); |
| |
| auto& img = sample->mimg(); |
| img.dmask = 0x1; |
| img.dim = ac_image_2d; |
| |
| /* Guard against a caller passing a size that does not match the built instruction. */ |
| assert(get_mimg_nsa_dwords(sample.get()) + 2 == instr_dwords); |
| |
| bld.insert(std::move(sample)); |
| } |
| |
| /* |
| * Build an image_bvh64_intersect_ray MIMG instruction with a v4 result in |
| * PhysReg(256) and insert it through the file-wide builder `bld`. |
| * |
| * The five address operands occupy twelve consecutive VGPRs starting at |
| * PhysReg(256): node (2), tmax (1), origin (3), dir (3), inv dir (3). |
| * dmask is 0xf and both unrm and r128 are set. |
| */ |
| void |
| create_bvh() |
| { |
| const unsigned vgpr_base = 256; |
| |
| aco_ptr<Instruction> ray{ |
| create_instruction(aco_opcode::image_bvh64_intersect_ray, Format::MIMG, 8, 1)}; |
| ray->definitions[0] = Definition(PhysReg(vgpr_base), v4); |
| |
| auto& ops = ray->operands; |
| ops[0] = Operand(PhysReg(0), s4); |
| ops[1] = Operand(s4); |
| ops[2] = Operand(v1); |
| ops[3] = Operand(PhysReg(vgpr_base + 0), v2); /* node */ |
| ops[4] = Operand(PhysReg(vgpr_base + 2), v1); /* tmax */ |
| ops[5] = Operand(PhysReg(vgpr_base + 3), v3); /* origin */ |
| ops[6] = Operand(PhysReg(vgpr_base + 6), v3); /* dir */ |
| ops[7] = Operand(PhysReg(vgpr_base + 9), v3); /* inv dir */ |
| |
| auto& img = ray->mimg(); |
| img.dmask = 0xf; |
| img.unrm = true; |
| img.r128 = true; |
| |
| bld.insert(std::move(ray)); |
| } |
| |
| BEGIN_TEST(insert_nops.nsa_to_vmem_bug) |
| if (!setup_cs(NULL, GFX10)) |
| return; |
| /* Checks when an s_nop is expected between an NSA image_sample built by |
| * create_mimg() and a following buffer_load_dword built by create_mubuf() |
| * on GFX10. Every case starts with a p_unit_test marker carrying the case |
| * number, followed by the instructions under test. |
| * |
| * NOTE(review): the comment lines made of two slashes followed by "!" or |
| * ">>" look like expected-output patterns consumed by the test harness |
| * (presumably compared after finish_insert_nops_test()); keep them in sync |
| * with the builder calls and do not reword them as ordinary comments. |
| */ |
| /* case 0: no nop needed because offset&6==0 (create_mubuf() is given offset 8) */ |
| //>> p_unit_test 0 |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:8 offen |
| bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); |
| create_mimg(true, 6, 4); /* NSA, 6 address VGPRs, asserted to be 4 dwords */ |
| create_mubuf(8); |
| |
| /* case 1: nop needed; same 4-dword NSA instruction, but the MUBUF offset is 4 |
| * (offset&6 != 0) and the load follows immediately */ |
| //! p_unit_test 1 |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d |
| //! s_nop |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u)); |
| create_mimg(true, 6, 4); |
| create_mubuf(4); |
| |
| /* case 2: no nop needed because the MIMG is not NSA (consecutive address VGPRs) */ |
| //! p_unit_test 2 |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[1], %0:v[2], %0:v[3], %0:v[4], %0:v[5] 2d |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); |
| create_mimg(false, 6, 2); |
| create_mubuf(4); |
| |
| /* case 3: no nop needed because there's already an instruction (v_nop) in-between */ |
| //! p_unit_test 3 |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d |
| //! v_nop |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u)); |
| create_mimg(true, 6, 4); |
| bld.vop1(aco_opcode::v_nop); |
| create_mubuf(4); |
| |
| /* case 4: no nop needed because the NSA instruction is under 4 dwords |
| * (2 address VGPRs, asserted to be 3 dwords) */ |
| //! p_unit_test 4 |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); |
| create_mimg(true, 2, 3); |
| create_mubuf(4); |
| |
| /* case 5: NSA instruction and MUBUF/MTBUF in a different block; the s_nop is |
| * still expected, at the start of BB1 before the load */ |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d |
| //! BB1 |
| //! /* logical preds: / linear preds: BB0, / kind: uniform, */ |
| //! s_nop |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u)); |
| create_mimg(true, 6, 4); |
| bld.reset(program->create_and_insert_block()); /* following instructions go into the new block */ |
| create_mubuf(4); |
| program->blocks[0].linear_succs.push_back(1); /* record block 1 as linear successor of block 0 */ |
| program->blocks[1].linear_preds.push_back(0); /* ... and block 0 as linear predecessor of block 1 */ |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.writelane_to_nsa_bug) |
| if (!setup_cs(NULL, GFX10)) |
| return; |
| /* Checks when an s_nop is expected between a v_writelane_b32_e64 (writing |
| * PhysReg(511), printed as v[255]) and a following image_sample built by |
| * create_mimg() on GFX10. Every case starts with a p_unit_test marker |
| * carrying the case number. |
| * |
| * NOTE(review): the comment lines made of two slashes followed by "!" or |
| * ">>" look like expected-output patterns consumed by the test harness; |
| * keep them in sync with the builder calls and do not reword them. |
| */ |
| /* case 0: nop needed; the writelane is directly followed by an NSA image_sample */ |
| //>> p_unit_test 0 |
| //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255] |
| //! s_nop |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d |
| bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); |
| bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(), |
| Operand(PhysReg(511), v1)); |
| create_mimg(true, 2, 3); /* NSA, 2 address VGPRs, asserted to be 3 dwords */ |
| |
| /* case 1: no nop needed because the MIMG is not NSA */ |
| //! p_unit_test 1 |
| //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255] |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[1] 2d |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u)); |
| bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(), |
| Operand(PhysReg(511), v1)); |
| create_mimg(false, 2, 2); |
| |
| /* case 2: no nop needed because there's already an instruction (v_nop) in-between */ |
| //! p_unit_test 2 |
| //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255] |
| //! v_nop |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); |
| bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(), |
| Operand(PhysReg(511), v1)); |
| bld.vop1(aco_opcode::v_nop); |
| create_mimg(true, 2, 3); |
| |
| /* case 3: writelane and NSA instruction in different blocks; the s_nop is |
| * still expected, at the start of BB1 before the image_sample */ |
| //! p_unit_test 3 |
| //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255] |
| //! BB1 |
| //! /* logical preds: / linear preds: BB0, / kind: uniform, */ |
| //! s_nop |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u)); |
| bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(), |
| Operand(PhysReg(511), v1)); |
| bld.reset(program->create_and_insert_block()); /* following instructions go into the new block */ |
| create_mimg(true, 2, 3); |
| program->blocks[0].linear_succs.push_back(1); /* record block 1 as linear successor of block 0 */ |
| program->blocks[1].linear_preds.push_back(0); /* ... and block 0 as linear predecessor of block 1 */ |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.vmem_to_scalar_write) |
| if (!setup_cs(NULL, GFX10)) |
| return; |
| /* Checks when "s_waitcnt_depctr vm_vsrc(0)" is expected between a VMEM or |
| * LDS instruction and a following scalar write (s_mov) on GFX10. Every case |
| * starts with a p_unit_test marker carrying the case number. |
| * |
| * NOTE(review): the comment lines made of two slashes followed by "!" or |
| * ">>" look like expected-output patterns consumed by the test harness; |
| * keep them in sync with the builder calls and do not reword them. |
| */ |
| /* WaR (write-after-read): VMEM load. Case 0 overwrites s[0], which the load |
| * reads as part of s[0-3]; case 1 overwrites exec. Both expect the wait. */ |
| //>> p_unit_test 0 |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s1: %0:s[0] = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| create_mubuf(0); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| |
| //! p_unit_test 1 |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s2: %0:exec = s_mov_b64 -1 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| create_mubuf(0); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| |
| /* case 2: no hazard: VMEM load; the s_mov writes s[4], outside the s[0-3] |
| * range the load reads */ |
| //! p_unit_test 2 |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //! s1: %0:s[4] = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| create_mubuf(0); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero()); |
| |
| /* case 3: no hazard: VMEM load with VALU (v_nop) in-between */ |
| //! p_unit_test 3 |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //! v_nop |
| //! s1: %0:s[0] = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| create_mubuf(0); |
| bld.vop1(aco_opcode::v_nop); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| |
| /* WaR: LDS. The ds_read_b32 takes m0 as an operand; case 4 overwrites m0 |
| * and case 5 overwrites exec. Both expect the wait. */ |
| //! p_unit_test 4 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0 |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s1: %0:m0 = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), |
| Operand(m0, s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); |
| |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0 |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s2: %0:exec = s_mov_b64 -1 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), |
| Operand(m0, s1)); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| |
| /* case 6: no hazard: LDS; the s_mov writes s[0], which the ds_read does not use */ |
| //! p_unit_test 6 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0 |
| //! s1: %0:s[0] = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), |
| Operand(m0, s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| |
| /* case 7: no hazard: LDS with VALU (v_nop) in-between */ |
| //! p_unit_test 7 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0 |
| //! v_nop |
| //! s1: %0:m0 = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), |
| Operand(m0, s1)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); |
| |
| /* cases 8-10: no hazard: VMEM/LDS with the correct waitcnt in-between |
| * (vmcnt for the load, vscnt for the store, lgkmcnt for the ds_read); no |
| * additional s_waitcnt_depctr is expected */ |
| //! p_unit_test 8 |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //! s_waitcnt vmcnt(0) |
| //! s1: %0:s[0] = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); |
| create_mubuf(0); |
| bld.sopp(aco_opcode::s_waitcnt, 0x3f70); /* printed as vmcnt(0) in the pattern above */ |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| |
| //! p_unit_test 9 |
| //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen |
| //! s_waitcnt_vscnt %0:null imm:0 |
| //! s1: %0:s[0] = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); |
| create_mubuf_store(); |
| bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| |
| //! p_unit_test 10 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0 |
| //! s_waitcnt lgkmcnt(0) |
| //! s1: %0:m0 = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), |
| Operand(m0, s1)); |
| bld.sopp(aco_opcode::s_waitcnt, 0xc07f); /* printed as lgkmcnt(0) in the pattern above */ |
| bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); |
| |
| /* cases 11-13: VMEM/LDS with the wrong waitcnt in-between (vscnt after a |
| * load, lgkmcnt after a store, vmcnt after a ds_read); the |
| * s_waitcnt_depctr is still expected */ |
| //! p_unit_test 11 |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //! s_waitcnt_vscnt %0:null imm:0 |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s1: %0:s[0] = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); |
| create_mubuf(0); |
| bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| |
| //! p_unit_test 12 |
| //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen |
| //! s_waitcnt lgkmcnt(0) |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s1: %0:s[0] = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); |
| create_mubuf_store(); |
| bld.sopp(aco_opcode::s_waitcnt, 0xc07f); /* printed as lgkmcnt(0) in the pattern above */ |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| |
| //! p_unit_test 13 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0 |
| //! s_waitcnt vmcnt(0) |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s1: %0:m0 = s_mov_b32 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1), |
| Operand(m0, s1)); |
| bld.sopp(aco_opcode::s_waitcnt, 0x3f70); /* printed as vmcnt(0) in the pattern above */ |
| bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.lds_direct_valu) |
| for (amd_gfx_level gfx : {GFX11, GFX12}) { |
| if (!setup_cs(NULL, gfx)) |
| continue; |
| /* Checks the wait_vdst value expected on an lds_direct_load that follows |
| * VALU instructions, once per gfx level in the loop above. Every case |
| * starts with a p_unit_test marker carrying the case number and ends with |
| * an lds_direct_load writing PhysReg(256) (printed as v[0]). |
| * |
| * NOTE(review): the comment lines made of two slashes followed by "!", |
| * ">>" or ";" look like expected-output patterns/scripts consumed by the |
| * test harness; keep them in sync with the builder calls and do not |
| * reword them. |
| */ |
| /* case 0: WaW (write-after-write): the v_mov and the lds_direct_load both write v[0] */ |
| //>> p_unit_test 0 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 1: WaR (write-after-read): the v_mov reads v[0], which the |
| * lds_direct_load then writes */ |
| //! p_unit_test 1 |
| //! v1: %0:v[1] = v_mov_b32 %0:v[0] |
| //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 2: No hazard: the v_mov only touches v[1], so no wait_vdst is expected. */ |
| //! p_unit_test 2 |
| //! v1: %0:v[1] = v_mov_b32 0 |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero()); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 3: multiple hazards, the nearest one should be considered */ |
| //! p_unit_test 3 |
| //! v1: %0:v[1] = v_mov_b32 %0:v[0] |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* cases 4-6: independent VALU instructions in-between increase wait_vdst: |
| * 1 v_nop gives wait_vdst:1, 10 give wait_vdst:10, and with 20 no |
| * wait_vdst is expected on the lds_direct_load at all */ |
| //! p_unit_test 4 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! v_nop |
| //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.vop1(aco_opcode::v_nop); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //; for i in range(10): insert_pattern('v_nop') |
| //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| for (unsigned i = 0; i < 10; i++) |
| bld.vop1(aco_opcode::v_nop); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //! p_unit_test 6 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //; for i in range(20): insert_pattern('v_nop') |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| for (unsigned i = 0; i < 20; i++) |
| bld.vop1(aco_opcode::v_nop); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* cases 7-8: a transcendental (v_sqrt_f32) after or as the hazardous VALU |
| * requires wait_vdst=0, even with other VALU instructions around it */ |
| //! p_unit_test 7 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! v_nop |
| //! v1: %0:v[1] = v_sqrt_f32 %0:v[1] |
| //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //! p_unit_test 8 |
| //! v1: %0:v[0] = v_sqrt_f32 %0:v[0] |
| //! v_nop |
| //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); |
| bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 9: a transcendental is fine if it's before the hazardous instruction; |
| * wait_vdst:1 is expected, as in case 4 */ |
| //! p_unit_test 9 |
| //! v1: %0:v[1] = v_sqrt_f32 %0:v[1] |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! v_nop |
| //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); |
| bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.vop1(aco_opcode::v_nop); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 10: non-VALU (here an s_mov_b32) does not increase wait_vdst */ |
| //! p_unit_test 10 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s1: %0:m0 = s_mov_b32 0 |
| //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 11: consider instructions which wait on vdst: after an explicit |
| * s_waitcnt_depctr va_vdst(0) no wait_vdst is expected on the load */ |
| //! p_unit_test 11 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! v_nop |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.vop1(aco_opcode::v_nop); |
| bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff); /* printed as va_vdst(0) in the pattern above */ |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| finish_insert_nops_test(); |
| } |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.lds_direct_vmem) |
| for (amd_gfx_level gfx : {GFX11, GFX12}) { |
| if (!setup_cs(NULL, gfx)) |
| continue; |
| /* Checks what is expected when an lds_direct_load writing PhysReg(256) |
| * (printed as v[0]) follows a VMEM or LDS instruction that reads or writes |
| * the same VGPR, once per gfx level in the loop above. Per the patterns |
| * below, gfx11 expects a separate "s_waitcnt_depctr vm_vsrc(0)" while gfx12 |
| * expects "wait_vsrc:0" on the lds_direct_load itself. |
| * |
| * NOTE(review): the comment lines made of two slashes followed by "!", |
| * ">>" or "~gfxN!" look like expected-output patterns consumed by the test |
| * harness (the "~gfxN" ones presumably apply to that gfx level only); keep |
| * them in sync with the builder calls and do not reword them. |
| */ |
| /* case 0: WaR (write-after-read): VMEM; the load uses v[0] as address and |
| * writes v[1] */ |
| //>> p_unit_test 0 |
| //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //~gfx11! s_waitcnt_depctr vm_vsrc(0) |
| //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| create_mubuf(0, PhysReg(257)); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 1: WaW (write-after-write): VMEM; the load writes v[0] and uses v[1] |
| * as address */ |
| //! p_unit_test 1 |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen |
| //~gfx11! s_waitcnt_depctr vm_vsrc(0) |
| //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| create_mubuf(0, PhysReg(256), PhysReg(257)); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 2: no hazard: VMEM; the load only touches v[1] */ |
| //! p_unit_test 2 |
| //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| create_mubuf(0, PhysReg(257), PhysReg(257)); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 3: no hazard: VMEM with VALU (v_nop) in-between */ |
| //! p_unit_test 3 |
| //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //! v_nop |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| create_mubuf(0, PhysReg(257)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 4: WaR: LDS; the ds_read_b32 reads v[0] and writes v[1] */ |
| //! p_unit_test 4 |
| //! v1: %0:v[1] = ds_read_b32 %0:v[0] |
| //~gfx11! s_waitcnt_depctr vm_vsrc(0) |
| //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 5: WaW: LDS; the ds_read_b32 writes v[0] and reads v[1] */ |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[1] |
| //~gfx11! s_waitcnt_depctr vm_vsrc(0) |
| //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 6: no hazard: LDS; the ds_read_b32 only touches v[1] */ |
| //! p_unit_test 6 |
| //! v1: %0:v[1] = ds_read_b32 %0:v[1] |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* case 7: no hazard: LDS with VALU (v_nop) in-between */ |
| //! p_unit_test 7 |
| //! v1: %0:v[1] = ds_read_b32 %0:v[0] |
| //! v_nop |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| /* cases 8-13: no hazard: VMEM/LDS with the correct waitcnt in-between. |
| * gfx11 uses s_waitcnt / s_waitcnt_vscnt, gfx12 uses the split s_wait_* |
| * instructions; cases 11-13 are only emitted for gfx12. */ |
| //! p_unit_test 8 |
| //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //~gfx11! s_waitcnt vmcnt(0) |
| //~gfx12! s_wait_loadcnt imm:0 |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); |
| create_mubuf(0, PhysReg(257)); |
| if (gfx >= GFX12) |
| bld.sopp(aco_opcode::s_wait_loadcnt, 0); |
| else |
| bld.sopp(aco_opcode::s_waitcnt, 0x3ff); /* printed as vmcnt(0) in the gfx11 pattern above */ |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //! p_unit_test 9 |
| //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen |
| //~gfx11! s_waitcnt_vscnt %0:null imm:0 |
| //~gfx12! s_wait_storecnt imm:0 |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); |
| create_mubuf_store(); |
| if (gfx >= GFX12) |
| bld.sopp(aco_opcode::s_wait_storecnt, 0); |
| else |
| bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //! p_unit_test 10 |
| //! v1: %0:v[1] = ds_read_b32 %0:v[0] |
| //~gfx11! s_waitcnt lgkmcnt(0) |
| //~gfx12! s_wait_dscnt imm:0 |
| //! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| if (gfx >= GFX12) |
| bld.sopp(aco_opcode::s_wait_dscnt, 0); |
| else |
| bld.sopp(aco_opcode::s_waitcnt, 0xfc0f); /* printed as lgkmcnt(0) in the gfx11 pattern above */ |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| if (gfx >= GFX12) { /* cases 11-13: image_load + loadcnt, image_sample + samplecnt, bvh + bvhcnt */ |
| //~gfx12! p_unit_test 11 |
| //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d |
| //~gfx12! s_wait_loadcnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); |
| Instruction* instr = |
| bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8), |
| Operand(s4), Operand(v1), Operand(PhysReg(256), v2)) |
| .instr; |
| instr->mimg().dmask = 0x1; |
| instr->mimg().dim = ac_image_2d; |
| bld.sopp(aco_opcode::s_wait_loadcnt, 0); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //~gfx12! p_unit_test 12 |
| //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d |
| //~gfx12! s_wait_samplecnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); |
| instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1), |
| Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1), |
| Operand(PhysReg(256), v2)) |
| .instr; |
| instr->mimg().dmask = 0x1; |
| instr->mimg().dim = ac_image_2d; |
| bld.sopp(aco_opcode::s_wait_samplecnt, 0); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //~gfx12! p_unit_test 13 |
| //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128 |
| //~gfx12! s_wait_bvhcnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13)); |
| create_bvh(); |
| bld.sopp(aco_opcode::s_wait_bvhcnt, 0); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| } |
| |
| /* cases 14-20: VMEM/LDS with the wrong waitcnt in-between; the vm_vsrc |
| * wait (gfx11) or wait_vsrc:0 (gfx12) is still expected. Cases 18-20 are |
| * only emitted for gfx12. */ |
| //! p_unit_test 14 |
| //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen |
| //~gfx11! s_waitcnt_vscnt %0:null imm:0 |
| //~gfx11! s_waitcnt_depctr vm_vsrc(0) |
| //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 |
| //~gfx12! s_wait_storecnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14)); |
| create_mubuf(0, PhysReg(257)); |
| if (gfx >= GFX12) |
| bld.sopp(aco_opcode::s_wait_storecnt, 0); |
| else |
| bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //! p_unit_test 15 |
| //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen |
| //~gfx11! s_waitcnt lgkmcnt(0) |
| //~gfx11! s_waitcnt_depctr vm_vsrc(0) |
| //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 |
| //~gfx12! s_wait_dscnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15)); |
| create_mubuf_store(); |
| if (gfx >= GFX12) |
| bld.sopp(aco_opcode::s_wait_dscnt, 0); |
| else |
| bld.sopp(aco_opcode::s_waitcnt, 0xfc0f); /* printed as lgkmcnt(0) in the gfx11 pattern above */ |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //! p_unit_test 16 |
| //! v1: %0:v[1] = ds_read_b32 %0:v[0] |
| //~gfx11! s_waitcnt vmcnt(0) |
| //~gfx11! s_waitcnt_depctr vm_vsrc(0) |
| //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 |
| //~gfx12! s_wait_loadcnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| if (gfx >= GFX12) |
| bld.sopp(aco_opcode::s_wait_loadcnt, 0); |
| else |
| bld.sopp(aco_opcode::s_waitcnt, 0x3ff); /* printed as vmcnt(0) in the gfx11 pattern above */ |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //! p_unit_test 17 |
| //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen |
| //~gfx11! s_waitcnt_vscnt %0:null imm:0 |
| //~gfx11! s_waitcnt_depctr vm_vsrc(0) |
| //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 |
| //~gfx12! s_wait_storecnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17)); |
| create_mubuf(0, PhysReg(256), PhysReg(257)); |
| if (gfx >= GFX12) |
| bld.sopp(aco_opcode::s_wait_storecnt, 0); |
| else |
| bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| if (gfx >= GFX12) { /* cases 18-20: image_load + samplecnt, image_sample + loadcnt, bvh + loadcnt */ |
| //~gfx12! p_unit_test 18 |
| //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d |
| //~gfx12! s_wait_samplecnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18)); |
| Instruction* instr = |
| bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8), |
| Operand(s4), Operand(v1), Operand(PhysReg(256), v2)) |
| .instr; |
| instr->mimg().dmask = 0x1; |
| instr->mimg().dim = ac_image_2d; |
| bld.sopp(aco_opcode::s_wait_samplecnt, 0); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //~gfx12! p_unit_test 19 |
| //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d |
| //~gfx12! s_wait_loadcnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19)); |
| instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1), |
| Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1), |
| Operand(PhysReg(256), v2)) |
| .instr; |
| instr->mimg().dmask = 0x1; |
| instr->mimg().dim = ac_image_2d; |
| bld.sopp(aco_opcode::s_wait_loadcnt, 0); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| |
| //~gfx12! p_unit_test 20 |
| //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128 |
| //~gfx12! s_wait_loadcnt imm:0 |
| //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20)); |
| create_bvh(); |
| bld.sopp(aco_opcode::s_wait_loadcnt, 0); |
| bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| } |
| |
| finish_insert_nops_test(); |
| } |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.valu_trans_use) |
| if (!setup_cs(NULL, GFX11)) |
| return; |
| |
| //>> p_unit_test 0 |
| //! v1: %0:v[0] = v_rcp_f32 %0:v[1] |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[1] = v_mov_b32 %0:v[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| |
| /* Sufficient VALU mitigates the hazard. */ |
| //! p_unit_test 1 |
| //! v1: %0:v[0] = v_rcp_f32 %0:v[1] |
| //; for i in range(4): insert_pattern('v_nop') |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[1] = v_mov_b32 %0:v[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); |
| for (unsigned i = 0; i < 4; i++) |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| |
| //! p_unit_test 2 |
| //! v1: %0:v[0] = v_rcp_f32 %0:v[1] |
| //; for i in range(8): insert_pattern('v_nop') |
| //! v1: %0:v[1] = v_mov_b32 %0:v[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); |
| for (unsigned i = 0; i < 8; i++) |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| |
| /* Sufficient transcendental VALU mitigates the hazard. */ |
| //! p_unit_test 3 |
| //! v1: %0:v[0] = v_rcp_f32 %0:v[1] |
| //! v1: %0:v[2] = v_sqrt_f32 %0:v[3] |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[1] = v_mov_b32 %0:v[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); |
| bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| |
| //! p_unit_test 4 |
| //! v1: %0:v[0] = v_rcp_f32 %0:v[1] |
| //! v1: %0:v[2] = v_sqrt_f32 %0:v[3] |
| //! v1: %0:v[2] = v_sqrt_f32 %0:v[3] |
| //! v1: %0:v[1] = v_mov_b32 %0:v[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); |
| for (unsigned i = 0; i < 2; i++) |
| bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| |
| /* Transcendental VALU should be counted towards VALU */ |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = v_rcp_f32 %0:v[1] |
| //; for i in range(5): insert_pattern('v_nop') |
| //! v1: %0:v[2] = v_sqrt_f32 %0:v[3] |
| //! v1: %0:v[1] = v_mov_b32 %0:v[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); |
| for (unsigned i = 0; i < 5; i++) |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| |
| /* non-VALU does not mitigate the hazard. */ |
| //! p_unit_test 6 |
| //! v1: %0:v[0] = v_rcp_f32 %0:v[1] |
| //; for i in range(8): insert_pattern('s_nop') |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[1] = v_mov_b32 %0:v[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); |
| for (unsigned i = 0; i < 8; i++) |
| bld.sopp(aco_opcode::s_nop, 0); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.valu_partial_forwarding.basic) |
| if (!setup_cs(NULL, GFX11)) |
| return; |
| |
| /* Basic case. */ |
| //>> p_unit_test 0 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s2: %0:exec = s_mov_b64 -1 |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1), |
| Operand(PhysReg(257), v1)); |
| |
| /* We should consider both the closest and further VALU after the exec write. */ |
| //! p_unit_test 1 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s2: %0:exec = s_mov_b64 -1 |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //; for i in range(2): insert_pattern('v_nop') |
| //! v1: %0:v[2] = v_mov_b32 2 |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2)); |
| bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1), |
| Operand(PhysReg(257), v1), Operand(PhysReg(258), v1)); |
| |
| //! p_unit_test 2 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s2: %0:exec = s_mov_b64 -1 |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //! v1: %0:v[2] = v_mov_b32 2 |
| //; for i in range(4): insert_pattern('v_nop') |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2)); |
| for (unsigned i = 0; i < 4; i++) |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1), |
| Operand(PhysReg(257), v1), Operand(PhysReg(258), v1)); |
| |
| /* If a VALU writes a read VGPR in-between the first and second writes, it should still be |
| * counted towards the distance between the first and second writes. |
| */ |
| //! p_unit_test 3 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s2: %0:exec = s_mov_b64 -1 |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //; for i in range(2): insert_pattern('v_nop') |
| //! v1: %0:v[2] = v_mov_b32 2 |
| //; for i in range(3): insert_pattern('v_nop') |
| //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2)); |
| for (unsigned i = 0; i < 3; i++) |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1), |
| Operand(PhysReg(257), v1), Operand(PhysReg(258), v1)); |
| |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.valu_partial_forwarding.multiple_exec_writes) |
| if (!setup_cs(NULL, GFX11)) |
| return; |
| |
| //>> p_unit_test 0 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s2: %0:exec = s_mov_b64 0 |
| //! s2: %0:exec = s_mov_b64 -1 |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0)); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1), |
| Operand(PhysReg(257), v1)); |
| |
| //! p_unit_test 1 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s2: %0:exec = s_mov_b64 0 |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //! s2: %0:exec = s_mov_b64 -1 |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1), |
| Operand(PhysReg(257), v1)); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow) |
| if (!setup_cs(NULL, GFX11)) |
| return; |
| |
| /* Control flow merges: one branch shouldn't interfere with the other (clobbering VALU closer |
| * than interesting one). |
| */ |
| //>> p_unit_test 0 |
| //! s_cbranch_scc1 block:BB2 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0u)); |
| bld.sopp(aco_opcode::s_cbranch_scc1, 2); |
| |
| //! BB1 |
| //! /* logical preds: / linear preds: BB0, / kind: */ |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s2: %0:exec = s_mov_b64 -1 |
| //! v_nop |
| //! s_branch block:BB3 |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[0].linear_succs.push_back(1); |
| program->blocks[1].linear_preds.push_back(0); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.sopp(aco_opcode::s_branch, 3); |
| |
| //! BB2 |
| //! /* logical preds: / linear preds: BB0, / kind: */ |
| //! v1: %0:v[0] = v_mov_b32 0 |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[0].linear_succs.push_back(2); |
| program->blocks[2].linear_preds.push_back(0); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| |
| //! BB3 |
| //! /* logical preds: / linear preds: BB1, BB2, / kind: */ |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1] |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[1].linear_succs.push_back(3); |
| program->blocks[2].linear_succs.push_back(3); |
| program->blocks[3].linear_preds.push_back(1); |
| program->blocks[3].linear_preds.push_back(2); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1), |
| Operand(PhysReg(257), v1)); |
| |
| /* Control flow merges: one branch shouldn't interfere with the other (should consider furthest |
| * VALU writes after exec). |
| */ |
| //! p_unit_test 1 |
| //! s_cbranch_scc1 block:BB5 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u)); |
| bld.sopp(aco_opcode::s_cbranch_scc1, 5); |
| |
| //! BB4 |
| //! /* logical preds: / linear preds: BB3, / kind: */ |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s2: %0:exec = s_mov_b64 -1 |
| //; for i in range(2): insert_pattern('v_nop') |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //! v_nop |
| //! s_branch block:BB6 |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[3].linear_succs.push_back(4); |
| program->blocks[4].linear_preds.push_back(3); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_nop); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.sopp(aco_opcode::s_branch, 6); |
| |
| //! BB5 |
| //! /* logical preds: / linear preds: BB3, / kind: */ |
| //! v1: %0:v[1] = v_mov_b32 1 |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[3].linear_succs.push_back(5); |
| program->blocks[5].linear_preds.push_back(3); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| |
| //! BB6 |
| //! /* logical preds: / linear preds: BB4, BB5, / kind: */ |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1] |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[4].linear_succs.push_back(6); |
| program->blocks[5].linear_succs.push_back(6); |
| program->blocks[6].linear_preds.push_back(4); |
| program->blocks[6].linear_preds.push_back(5); |
| bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1), |
| Operand(PhysReg(257), v1)); |
| |
| /* Control flow merges: one branch shouldn't interfere with the other (should consider closest |
| * VALU writes after exec). |
| */ |
| //! p_unit_test 2 |
| //! s_cbranch_scc1 block:BB8 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); |
| bld.sopp(aco_opcode::s_cbranch_scc1, 8); |
| |
| //! BB7 |
| //! /* logical preds: / linear preds: BB6, / kind: */ |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s2: %0:exec = s_mov_b64 -1 |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //; for i in range(4): insert_pattern('v_nop') |
| //! s_branch block:BB9 |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[6].linear_succs.push_back(7); |
| program->blocks[7].linear_preds.push_back(6); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| for (unsigned i = 0; i < 4; i++) |
| bld.vop1(aco_opcode::v_nop); |
| bld.sopp(aco_opcode::s_branch, 9); |
| |
| //! BB8 |
| //! /* logical preds: / linear preds: BB6, / kind: */ |
| //! v1: %0:v[1] = v_mov_b32 1 |
| //; for i in range(5): insert_pattern('v_nop') |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[6].linear_succs.push_back(8); |
| program->blocks[8].linear_preds.push_back(6); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1)); |
| for (unsigned i = 0; i < 5; i++) |
| bld.vop1(aco_opcode::v_nop); |
| |
| //! BB9 |
| //! /* logical preds: / linear preds: BB7, BB8, / kind: uniform, */ |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1] |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[7].linear_succs.push_back(9); |
| program->blocks[8].linear_succs.push_back(9); |
| program->blocks[9].linear_preds.push_back(7); |
| program->blocks[9].linear_preds.push_back(8); |
| bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1), |
| Operand(PhysReg(257), v1)); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.valu_mask_write) |
| if (!setup_cs(NULL, GFX11)) |
| return; |
| |
| /* Basic case. */ |
| //>> p_unit_test 0 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[2] = s_mov_b32 %0:s[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); |
| |
| /* Mitigation. */ |
| //! p_unit_test 1 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! v1: %0:v[1] = v_mov_b32 %0:s[1] |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! s1: %0:s[2] = s_mov_b32 %0:s[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); |
| |
| //! p_unit_test 2 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[2] = s_mov_b32 %0:s[1] |
| //! s1: %0:s[2] = s_mov_b32 %0:s[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); |
| |
| //! p_unit_test 3 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[2] = s_mov_b32 %0:s[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); |
| |
| /* v_cndmask_b32 is both involved in the hazard and is a mitigation. */ |
| //! p_unit_test 4 |
| //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1] |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[2] = s_mov_b32 %0:s[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand(PhysReg(2), s1), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); |
| |
| /* VALU reading exec does not mitigate the hazard. We also don't consider literals. */ |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! v1: %0:v[1] = v_mov_b32 %0:exec_lo |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[2] = s_mov_b32 %0:s[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(exec_lo, s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); |
| |
| //! p_unit_test 6 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! v1: %0:v[1] = v_mov_b32 0x200 |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[2] = s_mov_b32 %0:s[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::literal32(0x200)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); |
| |
| /* Basic case: VALU. */ |
| //! p_unit_test 7 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! v1: %0:v[1] = v_mov_b32 %0:s[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1)); |
| |
| /* SALU which both reads and writes a lane mask SGPR. */ |
| //! p_unit_test 8 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3] |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[2] = s_mov_b32 %0:s[1] |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[4] = s_mov_b32 %0:s[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(2), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(2), s1)); |
| |
| /* When a SALU writes a lane mask, we shouldn't forget the current SGPRs used as lane masks then |
| * written. */ |
| //! p_unit_test 9 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! s1: %0:s[0] = s_mov_b32 0 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3] |
| //! s1: %0:s[2] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[4] = s_mov_b32 %0:s[0] |
| //! s1: %0:s[5] = s_mov_b32 %0:s[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(2), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1)); |
| |
| /* When a SALU writes a lane mask, we shouldn't forget all SGPRs used as lane masks, there might |
| * be later problematic writes. */ |
| //! p_unit_test 10 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! s1: %0:s[0] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[4] = s_mov_b32 %0:s[0] |
| //! s1: %0:s[1] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[5] = s_mov_b32 %0:s[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(1), s1)); |
| |
| //! p_unit_test 11 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| //! s1: %0:s[0] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[4] = s_mov_b32 %0:s[0] |
| //! s1: %0:s[0] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[5] = s_mov_b32 %0:s[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(0), s1)); |
| |
| //! p_unit_test 12 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); |
| |
| //! BB1 |
| //! /* logical preds: / linear preds: BB0, / kind: */ |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[0].linear_succs.push_back(1); |
| program->blocks[1].linear_preds.push_back(0); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(0), s2)); |
| |
| //! BB2 |
| //! /* logical preds: / linear preds: BB0, / kind: */ |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3] |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[0].linear_succs.push_back(2); |
| program->blocks[2].linear_preds.push_back(0); |
| bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(PhysReg(2), s2)); |
| |
| //! BB3 |
| //! /* logical preds: / linear preds: BB1, BB2, / kind: uniform, */ |
| //! s1: %0:s[0] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[4] = s_mov_b32 %0:s[0] |
| //! s1: %0:s[2] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[5] = s_mov_b32 %0:s[2] |
| bld.reset(program->create_and_insert_block()); |
| program->blocks[1].linear_succs.push_back(3); |
| program->blocks[2].linear_succs.push_back(3); |
| program->blocks[3].linear_preds.push_back(1); |
| program->blocks[3].linear_preds.push_back(2); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero()); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1)); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.wmma_raw) |
| if (!setup_cs(NULL, GFX11)) |
| return; |
| |
| /* Basic case. */ |
| //>> p_unit_test 0 |
| //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx |
| //! v_nop |
| //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| Operand A(PhysReg(256 + 0), v8); |
| Operand B(PhysReg(256 + 8), v8); |
| Operand C(PhysReg(256 + 20), v4); |
| bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0, |
| 0); |
| A.setFixed(PhysReg(256 + 24)); |
| B.setFixed(PhysReg(256 + 16)); |
| C.setFixed(PhysReg(256 + 48)); |
| bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0, |
| 0); |
| |
| /* Mitigation. */ |
| //! p_unit_test 1 |
| //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx |
| //! v1: %_:v[56] = v_rcp_f32 0 |
| //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| A.setFixed(PhysReg(256 + 0)); |
| B.setFixed(PhysReg(256 + 8)); |
| C.setFixed(PhysReg(256 + 20)); |
| bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0, |
| 0); |
| bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256 + 56), v1), Operand::zero()); |
| A.setFixed(PhysReg(256 + 24)); |
| B.setFixed(PhysReg(256 + 16)); |
| C.setFixed(PhysReg(256 + 48)); |
| bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0, |
| 0); |
| |
| /* No hazard. */ |
| //>> p_unit_test 2 |
| //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx |
| //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[48-51].xx |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| A.setFixed(PhysReg(256 + 0)); |
| B.setFixed(PhysReg(256 + 8)); |
| C.setFixed(PhysReg(256 + 20)); |
| bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0, |
| 0); |
| A.setFixed(PhysReg(256 + 24)); |
| B.setFixed(PhysReg(256 + 32)); |
| C.setFixed(PhysReg(256 + 48)); |
| bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0, |
| 0); |
| |
| //>> p_unit_test 3 |
| //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx |
| //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[20-23].xx |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| A.setFixed(PhysReg(256 + 0)); |
| B.setFixed(PhysReg(256 + 8)); |
| C.setFixed(PhysReg(256 + 20)); |
| bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0, |
| 0); |
| A.setFixed(PhysReg(256 + 24)); |
| B.setFixed(PhysReg(256 + 32)); |
| C.setFixed(PhysReg(256 + 20)); |
| bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0, |
| 0); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
/* Bit flags that can be OR-ed together in StageInfo::flags. */
enum StageInfoFlags {
   stage_separate = 0x01,
   stage_has_prolog = 0x02,
   stage_has_export = 0x04,
   stage_is_prolog = 0x08,
   stage_is_epilog = 0x10,
};
| |
| struct StageInfo { |
| const char* name; |
| Stage stage; |
| unsigned flags; |
| }; |
| |
| BEGIN_TEST(insert_nops.export_priority.stages) |
| Stage geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::GS); |
| for (StageInfo stage : (StageInfo[]){ |
| {"_fs_first_last", fragment_fs, stage_has_export}, |
| {"_fs_with_epilog_first", fragment_fs, 0}, |
| {"_fs_prolog_first", fragment_fs, stage_is_prolog}, |
| {"_fs_epilog_last", fragment_fs, stage_is_epilog | stage_has_export}, |
| {"_vs_first_last", vertex_ngg, stage_has_export}, |
| {"_vs_with_prolog_last", vertex_ngg, stage_has_export | stage_has_prolog}, |
| {"_tes_first_last", tess_eval_ngg, stage_has_export}, |
| {"_ms_first_last", mesh_ngg, stage_has_export}, |
| {"_tesgs_first_last", tess_eval_geometry_ngg, stage_has_export}, |
| {"_vsgs_first_last", vertex_geometry_ngg, stage_has_export}, |
| {"_vsgs_with_prolog_last", vertex_geometry_ngg, stage_has_export | stage_has_prolog}, |
| {"_separate_vs_first", vertex_ngg, stage_separate}, |
| {"_separate_vs_with_prolog", vertex_ngg, stage_separate | stage_has_prolog}, |
| {"_separate_tes_first", tess_eval_ngg, stage_separate}, |
| {"_separate_gs_last", geometry_ngg, stage_separate | stage_has_export}}) { |
| if (!setup_cs(NULL, GFX11_5, CHIP_UNKNOWN, stage.name)) |
| continue; |
| |
| program->stage = stage.stage; |
| program->info.merged_shader_compiled_separately = stage.flags & stage_separate; |
| program->info.vs.has_prolog = stage.flags & stage_has_prolog; |
| program->is_prolog = stage.flags & stage_is_prolog; |
| program->is_epilog = stage.flags & stage_is_epilog; |
| //>> /* logical preds: / linear preds: / kind: uniform, top-level, */ |
| //~.*first.*! s_setprio imm:2 |
| if (stage.flags & stage_has_export) { |
| //~.*last.*! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0 |
| //~.*last.*! s_setprio imm:0 |
| //~.*last.*! s_nop |
| //~.*last.*! s_nop |
| //~.*last.*! s_endpgm |
| bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0, |
| V_008DFC_SQ_EXP_POS, false); |
| } else { |
| //(?!.*last.*)! v_nop |
| bld.vop1(aco_opcode::v_nop); |
| } |
| |
| finish_insert_nops_test(stage.flags & stage_has_export); |
| } |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.export_priority.instrs_after_export) |
| if (!setup_cs(NULL, GFX11_5)) |
| return; |
| |
| program->stage = vertex_ngg; |
| //>> /* logical preds: / linear preds: / kind: uniform, top-level, */ |
| //! s_setprio imm:2 |
| //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0 |
| //! s_setprio imm:0 |
| //! s_waitcnt_expcnt %0:null imm:0 |
| //! s_nop |
| //! s_nop |
| //! s_setprio imm:2 |
| //! v_nop |
| //! s_endpgm |
| bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0, |
| V_008DFC_SQ_EXP_POS, false); |
| bld.vop1(aco_opcode::v_nop); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.export_priority.fallthrough_to_endpgm) |
| /* The export is the last instruction of BB0, which falls through into a second block. Per |
| * the check lines, BB0 is expected to end with s_setprio 0 and two s_nop, and BB1 to hold |
| * only the s_endpgm. |
| */ |
| const bool supported = setup_cs(NULL, GFX11_5); |
| if (!supported) |
| return; |
| |
| program->stage = vertex_ngg; |
| //>> /* logical preds: / linear preds: / kind: top-level, */ |
| //! s_setprio imm:2 |
| //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0 |
| //! s_setprio imm:0 |
| //! s_nop |
| //! s_nop |
| //>> BB1 |
| //>> /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */ |
| //! s_endpgm |
| const Operand undef_src = Operand(v1); |
| bld.exp(aco_opcode::exp, undef_src, undef_src, undef_src, undef_src, 0x0, V_008DFC_SQ_EXP_POS, |
| false); |
| |
| /* Continue building in a new block and link it as the successor of BB0. The references are |
| * taken only after the block has been inserted. |
| */ |
| bld.reset(program->create_and_insert_block()); |
| Block& first_block = program->blocks[0]; |
| Block& second_block = program->blocks[1]; |
| first_block.linear_succs.push_back(1); |
| first_block.logical_succs.push_back(1); |
| second_block.linear_preds.push_back(0); |
| second_block.logical_preds.push_back(0); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.export_priority.multiple_exports) |
| if (!setup_cs(NULL, GFX11_5)) |
| return; |
| |
| program->stage = vertex_ngg; |
| //>> /* logical preds: / linear preds: / kind: uniform, top-level, */ |
| //! s_setprio imm:2 |
| //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0 |
| //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos1 |
| //! s_setprio imm:0 |
| //! s_nop |
| //! s_nop |
| //! s_endpgm |
| bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0, |
| V_008DFC_SQ_EXP_POS, false); |
| bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0, |
| V_008DFC_SQ_EXP_POS + 1, false); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.export_priority.set_prio) |
| /* The shader already contains its own s_setprio instructions. Per the check lines, the |
| * leading s_setprio 3 is expected to stay as it is, while the s_setprio 1 in front of the |
| * export is expected to come out as s_setprio 2. |
| */ |
| const bool supported = setup_cs(NULL, GFX11_5); |
| if (!supported) |
| return; |
| |
| program->stage = vertex_ngg; |
| //>> /* logical preds: / linear preds: / kind: uniform, top-level, */ |
| //! s_setprio imm:3 |
| //! v_nop |
| //! s_setprio imm:2 |
| //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0 |
| //! s_setprio imm:0 |
| //! s_nop |
| //! s_nop |
| //! s_endpgm |
| const Operand undef_src = Operand(v1); |
| bld.sopp(aco_opcode::s_setprio, 3); |
| bld.vop1(aco_opcode::v_nop); |
| bld.sopp(aco_opcode::s_setprio, 1); |
| bld.exp(aco_opcode::exp, undef_src, undef_src, undef_src, undef_src, 0x0, V_008DFC_SQ_EXP_POS, |
| false); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.valu_read_sgpr.basic) |
| if (!setup_cs(NULL, GFX12)) |
| return; |
| /* Walks through the GFX12 VALU-read-SGPR hazard case by case. Every case starts with a |
| * p_unit_test marker, and the check lines in front of each case give the expected output, |
| * including any s_waitcnt_depctr that should show up. |
| * |
| * The leading v_mov_b32 instructions make the VALU read s[4], s[7], null, exec_lo, m0, scc |
| * and vcc before the first numbered case. |
| */ |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(7), s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(sgpr_null, s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(exec_lo, s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(m0, s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(scc, s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc, s1)); |
| |
| /* no hazard: SALU write missing */ |
| //>> p_unit_test 0 |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); |
| |
| /* no hazard: SGPR never read by VALU */ |
| //! p_unit_test 1 |
| //! s1: %0:s[16] = s_mov_b32 0 |
| //! s1: %0:s[64] = s_mov_b32 %0:s[16] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(16), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(16), s1)); |
| |
| /* basic case: SALU read */ |
| //! p_unit_test 2 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); |
| |
| /* basic case again: VALU reads never expire */ |
| //! p_unit_test 3 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); |
| |
| /* sa_sdst(0) resolves the hazard */ |
| //! p_unit_test 4 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); |
| /* an s_waitcnt_depctr with sa_sdst(0) that is already in the input is sufficient */ |
| //! p_unit_test 5 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); |
| |
| /* basic case: VALU read */ |
| //! p_unit_test 6 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| |
| /* the SALU write is in the same SGPR pair as the VALU read */ |
| //! p_unit_test 7 |
| //! s1: %0:s[6] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[64] = s_mov_b32 %0:s[6] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(6), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1)); |
| |
| /* no hazard: these registers are not problematic */ |
| //! p_unit_test 8 |
| //! s1: %0:null = s_mov_b32 0 |
| //! s1: %0:s[64] = s_mov_b32 %0:null |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(sgpr_null, s1)); |
| |
| //! p_unit_test 9 |
| //! s1: %0:exec_lo = s_mov_b32 0 |
| //! s1: %0:s[64] = s_mov_b32 %0:exec_lo |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(exec_lo, s1)); |
| |
| //! p_unit_test 10 |
| //! s1: %0:m0 = s_mov_b32 0 |
| //! s1: %0:s[64] = s_mov_b32 %0:m0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(m0, s1)); |
| |
| //! p_unit_test 11 |
| //! s1: %0:scc = s_cmp_lg_i32 0, 0 |
| //! s1: %0:s[64] = s_mov_b32 %0:scc |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); |
| bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand::zero(4), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(scc, s1)); |
| |
| /* 11 SALU between the write and a VALU read expire the hazard */ |
| //! p_unit_test 12 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //; for i in range(11): insert_pattern('s1: %0:s[64] = s_mov_b32 0') |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| for (unsigned i = 0; i < 11; i++) |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| /* with only 10 SALU in between, the wait is expected again */ |
| //! p_unit_test 13 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0') |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| for (unsigned i = 0; i < 10; i++) |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| |
| /* 10 SALU between the write and a SALU read expire the hazard */ |
| //! p_unit_test 14 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0') |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| for (unsigned i = 0; i < 10; i++) |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); |
| /* with only 9 SALU in between, the wait is expected again */ |
| //! p_unit_test 15 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0') |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| for (unsigned i = 0; i < 9; i++) |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); |
| |
| /* SOPP in-between the write and the read do not count */ |
| //! p_unit_test 16 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0') |
| //! s_nop |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| for (unsigned i = 0; i < 9; i++) |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4)); |
| bld.sopp(aco_opcode::s_nop, 0); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); |
| |
| /* VALU -> VALU non-VCC SGPR */ |
| //! p_unit_test 17 |
| //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] |
| //! s_waitcnt_depctr va_sdst(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| |
| /* VALU -> VALU VCC SGPR */ |
| //! p_unit_test 18 |
| //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] |
| //! s_waitcnt_depctr va_vcc(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); |
| |
| /* va_sdst=0 from SALU reading an SGPR: hazard mitigated */ |
| //! p_unit_test 19 |
| //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] |
| //! s1: %0:s[64] = s_mov_b32 %0:s[6] |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| |
| /* va_vcc=0 from SALU reading VCC: hazard mitigated */ |
| //! p_unit_test 20 |
| //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] |
| //! s1: %0:s[64] = s_mov_b32 %0:vcc_lo |
| //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(vcc, s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); |
| |
| /* VALU -> VALU read VCC and then SGPR */ |
| //! p_unit_test 21 |
| //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] |
| //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] |
| //! s_waitcnt_depctr va_vcc(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi |
| //! s_waitcnt_depctr va_sdst(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(21)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| |
| /* VALU -> VALU read SGPR and then VCC */ |
| //! p_unit_test 22 |
| //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] |
| //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] |
| //! s_waitcnt_depctr va_sdst(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| //! s_waitcnt_depctr va_vcc(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(22)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); |
| |
| /* VALU writes VCC and SALU writes SGPR */ |
| //! p_unit_test 23 |
| //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //! s_waitcnt_depctr va_vcc(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(23)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.valu_read_sgpr.previous_part) |
| /* Same SALU-write / SALU-read pair as the basic test, but in a raytracing compute shader. */ |
| const bool supported = setup_cs(NULL, GFX12); |
| if (!supported) |
| return; |
| |
| /* A raytracing shader comes with a prolog and can be split into several parts. */ |
| program->stage = raytracing_cs; |
| |
| /* No VALU in this shader ever reads the SGPR, yet a sa_sdst(0) wait is still expected. */ |
| //>> p_unit_test 0 |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s1: %0:s[64] = s_mov_b32 %0:s[4] |
| const PhysReg written_sgpr = PhysReg(4); |
| const PhysReg copy_dst = PhysReg(64); |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(written_sgpr, s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(copy_dst, s1), Operand(written_sgpr, s1)); |
| |
| finish_insert_nops_test(); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.setpc_gfx6) |
| /* GFX6: what has to be inserted in front of a s_setpc_b64 (cases 0 and 1), followed by one |
| * case that is built as a separate program without a s_setpc_b64 (case 2). |
| */ |
| const bool supported = setup_cs(NULL, GFX6); |
| if (!supported) |
| return; |
| |
| /* Operands shared by cases 0 and 1. */ |
| const Operand jump_target = Operand::zero(8); |
| const Definition sgpr0_def = Definition(PhysReg(0), s1); |
| |
| /* SGPR->SMEM hazards */ |
| //>> p_unit_test 0 |
| //! s1: %0:s[0] = s_mov_b32 0 |
| //! s_nop imm:2 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.sop1(aco_opcode::s_mov_b32, sgpr0_def, Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, jump_target); |
| |
| /* an s_nop 2 that is already present: no additional one is expected */ |
| //! p_unit_test 1 |
| //! s1: %0:s[0] = s_mov_b32 0 |
| //! s_nop imm:2 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.sop1(aco_opcode::s_mov_b32, sgpr0_def, Operand::zero()); |
| bld.sopp(aco_opcode::s_nop, 2); |
| bld.sop1(aco_opcode::s_setpc_b64, jump_target); |
| |
| finish_insert_nops_test(); |
| |
| /* The next hazard is resolved by the s_setpc_b64 itself, so it is tested without one. */ |
| |
| /* VINTRP->v_readlane_b32/etc */ |
| //>> p_unit_test 2 |
| //! v1: %0:v[0] = v_interp_mov_f32 2, %0:m0 attr0.x |
| //! s_nop |
| create_program(GFX6, compute_cs, 64, CHIP_UNKNOWN); |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| const Definition interp_dst = Definition(PhysReg(256), v1); |
| bld.vintrp(aco_opcode::v_interp_mov_f32, interp_dst, Operand::c32(2u), Operand(m0, s1), 0, 0); |
| /* NOTE(review): the false argument presumably suppresses the trailing s_endpgm; confirm |
| * against the helper's definition. |
| */ |
| finish_insert_nops_test(false); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.setpc_gfx7) |
| for (amd_gfx_level gfx : {GFX7, GFX9}) { |
| if (!setup_cs(NULL, gfx)) |
| continue; |
| /* Runs the same sequence on GFX7 and on GFX9. Cases 0-6 end in s_setpc_b64 and check what |
| * has to be inserted in front of it; cases 7-9 each build a fresh program without a |
| * s_setpc_b64. Check lines prefixed with ~gfx9 presumably apply to the GFX9 run only |
| * (TODO confirm against the test framework). |
| */ |
| //>> p_unit_test 0 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* Break up SMEM clauses: resolved by the s_setpc_b64 itself */ |
| //! p_unit_test 1 |
| //! s1: %0:s[0] = s_load_dword %0:s[0-1] |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* SALU and GDS hazards */ |
| //! p_unit_test 2 |
| //! s_setreg_imm32_b32 0x0 imm:14337 |
| //! s_nop |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(0), (7 << 11) | 1); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VALU writes vcc -> vccz/v_div_fmas */ |
| //! p_unit_test 3 |
| //! s2: %0:vcc = v_cmp_eq_u32 0, 0 |
| //! s_nop imm:3 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand::zero(), Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VALU writes exec -> execz/DPP */ |
| //! p_unit_test 4 |
| //! s2: %0:exec = v_cmpx_eq_u32 0, 0 |
| //! s_nop imm:3 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), |
| Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VALU->DPP */ |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //~gfx9! s_nop |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VALU->v_readlane_b32/VMEM/etc */ |
| //! p_unit_test 6 |
| //! s1: %0:s[0] = v_readfirstlane_b32 %0:v[0] |
| //! s_nop imm:3 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(0), s1), |
| Operand(PhysReg(256), v1)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| finish_insert_nops_test(); |
| |
| /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves |
| * them. */ |
| |
| //>> p_unit_test 7 |
| //! buffer_store_dwordx3 %0:s[0-3], %0:v[0], 0, %0:v[0-2] offen |
| //! s_nop |
| create_program(gfx, compute_cs, 64, CHIP_UNKNOWN); |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.mubuf(aco_opcode::buffer_store_dwordx3, Operand(PhysReg(0), s4), |
| Operand(PhysReg(256), v1), Operand::zero(), Operand(PhysReg(256), v3), 0, true); |
| finish_insert_nops_test(false); |
| |
| //>> p_unit_test 8 |
| //! s1: %0:m0 = s_mov_b32 0 |
| //! s_nop |
| create_program(gfx, compute_cs, 64, CHIP_UNKNOWN); |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(m0), s1), Operand::zero()); |
| finish_insert_nops_test(false); |
| |
| /* Break up SMEM clauses */ |
| //>> p_unit_test 9 |
| //! s1: %0:s[0] = s_load_dword %0:s[0-1] |
| //! s_nop |
| create_program(gfx, compute_cs, 64, CHIP_UNKNOWN); |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); |
| bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2)); |
| finish_insert_nops_test(false); |
| } |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.setpc_gfx10) |
| if (!setup_cs(NULL, GFX10)) |
| return; |
| /* GFX10: cases 0-6 end in s_setpc_b64 and check which pending hazards get resolved in front |
| * of it. Cases 7-10 each build a fresh program and pass false to finish_insert_nops_test(). |
| */ |
| //>> p_unit_test 0 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VcmpxPermlaneHazard */ |
| //! p_unit_test 1 |
| //! s2: %0:exec = v_cmpx_eq_u32 0, 0 |
| //! v1: %0:v[0] = v_mov_b32 %0:v[0] |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VMEMtoScalarWriteHazard */ |
| //! p_unit_test 2 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0] |
| //! s_waitcnt_vscnt %0:null imm:0 |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); |
| bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), |
| 0); /* reset LdsBranchVmemWARHazard */ |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VcmpxExecWARHazard */ |
| //! p_unit_test 3 |
| //! s1: %0:s[0] = s_mov_b32 %0:exec_hi |
| //! s_waitcnt_depctr sa_sdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand(exec_hi, s1)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* LdsBranchVmemWARHazard */ |
| //! p_unit_test 4 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0] |
| //! v_nop |
| //! s_branch block:BB0 |
| //! s_waitcnt_vscnt %0:null imm:0 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */ |
| bld.sopp(aco_opcode::s_branch, 0); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| /* same sequence without the s_branch: the s_waitcnt_vscnt is still expected */ |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0] |
| //! v_nop |
| //! s_waitcnt_vscnt %0:null imm:0 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */ |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* waNsaCannotFollowWritelane: resolved by the s_setpc_b64 */ |
| //! p_unit_test 6 |
| //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0] |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1), |
| Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| finish_insert_nops_test(); |
| |
| /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves them. |
| */ |
| |
| /* SMEMtoVectorWriteHazard */ |
| //>> p_unit_test 7 |
| //! s1: %0:s[0] = s_load_dword %0:s[0-1] |
| //! s1: %0:null = s_mov_b32 0 |
| create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN); |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2)); |
| finish_insert_nops_test(false); |
| |
| /* NSAToVMEMBug is already resolved indirectly through VMEMtoScalarWriteHazard and |
| * LdsBranchVmemWARHazard. */ |
| //>> p_unit_test 8 |
| //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s_waitcnt_vscnt %0:null imm:0 |
| create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN); |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); |
| create_mimg(true, 6, 4); |
| finish_insert_nops_test(false); |
| |
| /* waNsaCannotFollowWritelane */ |
| //>> p_unit_test 9 |
| //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0] |
| //! s_nop |
| create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN); |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); |
| bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1), |
| Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1)); |
| finish_insert_nops_test(false); |
| |
| /* FPAtomicToDenormModeHazard */ |
| //>> p_unit_test 10 |
| //! flat_atomic_fmin %0:v[0-1], s1: undef, %0:v[0] |
| //! s_nop imm:2 |
| //! s_waitcnt_depctr vm_vsrc(0) |
| create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN); |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); |
| bld.flat(aco_opcode::flat_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1), |
| Operand(PhysReg(256), v1)); |
| finish_insert_nops_test(false); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.setpc_gfx11) |
| if (!setup_cs(NULL, GFX11)) |
| return; |
| |
| /* GFX11: every case ends in s_setpc_b64, and the check lines give what is expected in front |
| * of it for each pending hazard. Case ids are unique and ascending (0-9), and each id appears |
| * both in the check line and in the p_unit_test operand, so a failing check maps to exactly |
| * one case. |
| */ |
| //>> p_unit_test 0 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* LdsDirectVALUHazard */ |
| //! p_unit_test 1 |
| //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0 |
| //! s_waitcnt_depctr va_vdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1), |
| Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VALUPartialForwardingHazard */ |
| //! p_unit_test 2 |
| //! v1: %0:v[0] = v_mov_b32 0 |
| //! s_waitcnt_depctr va_vdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VcmpxPermlaneHazard */ |
| //! p_unit_test 3 |
| //! s2: %0:exec = v_cmpx_eq_u32 0, 0 |
| //! v_nop |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VALUTransUseHazard */ |
| //! p_unit_test 4 |
| //! v1: %0:v[0] = v_rcp_f32 0 |
| //! s_waitcnt_depctr va_vdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VALUMaskWriteHazard: SALU writes vcc_hi after the VALU read vcc as a mask */ |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc |
| //! s1: %0:vcc_hi = s_mov_b32 0 |
| //! s_waitcnt_depctr va_vdst(0) sa_sdst(0) |
| //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0] |
| //! s_waitcnt_depctr va_vdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(vcc, s2)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(vcc_hi, s1), Operand::c32(0)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* Same without any SALU write of the mask: the wait in front of the inserted v_xor3_b32 is |
| * expected without sa_sdst(0). |
| */ |
| //! p_unit_test 6 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc |
| //! s_waitcnt_depctr va_vdst(0) |
| //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0] |
| //! s_waitcnt_depctr va_vdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(vcc, s2)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* Same as case 5, but the SALU writes the whole 64-bit vcc. */ |
| //! p_unit_test 7 |
| //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc |
| //! s2: %0:vcc = s_mov_b64 0 |
| //! s_waitcnt_depctr va_vdst(0) sa_sdst(0) |
| //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0] |
| //! s_waitcnt_depctr va_vdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(), |
| Operand::zero(), Operand(vcc, s2)); |
| bld.sop1(aco_opcode::s_mov_b64, Definition(vcc, s2), Operand::zero(8)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* LdsDirectVMEMHazard */ |
| //! p_unit_test 8 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0] |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* WMMA Hazards */ |
| //! p_unit_test 9 |
| //! v4: %0:v[20-23] = v_wmma_f16_16x16x16_f16 %0:v[0-7].xx, %0:v[8-15].xx, %0:v[20-23].xx |
| //! v_nop |
| //! s_waitcnt_depctr va_vdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); |
| /* A and B are the 8-VGPR inputs; C is both the accumulator input and the destination. */ |
| Operand A(PhysReg(256 + 0), v8); |
| Operand B(PhysReg(256 + 8), v8); |
| Operand C(PhysReg(256 + 20), v4); |
| bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0, |
| 0); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| finish_insert_nops_test(true); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.setpc_gfx12) |
| if (!setup_cs(NULL, GFX12)) |
| return; |
| |
| //>> p_unit_test 0 |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* LdsDirectVALUHazard */ |
| //! p_unit_test 1 |
| //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0 |
| //! s_waitcnt_depctr va_vdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1), |
| Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VcmpxPermlaneHazard */ |
| //! p_unit_test 2 |
| //! s2: %0:exec = v_cmpx_eq_u32 0, 0 |
| //! v_nop |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero()); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* LdsDirectVMEMHazard */ |
| //! p_unit_test 3 |
| //! v1: %0:v[0] = ds_read_b32 %0:v[0] |
| //! s_waitcnt_depctr vm_vsrc(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| /* VALUReadSGPRHazard */ |
| //! p_unit_test 4 |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //! s_waitcnt_depctr va_vdst(0) sa_sdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| //! p_unit_test 5 |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0') |
| //! s_waitcnt_depctr va_vdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| for (unsigned i = 0; i < 10; i++) /* the s_setpc_b64 counts */ |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| //! p_unit_test 6 |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0') |
| //! s_waitcnt_depctr va_vdst(0) sa_sdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| for (unsigned i = 0; i < 9; i++) |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| //! p_unit_test 7 |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] |
| //! s_waitcnt_depctr va_vdst(0) va_sdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| //! p_unit_test 8 |
| //! v1: %0:v[0] = v_mov_b32 %0:vcc_lo |
| //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[0] |
| //! s_waitcnt_depctr va_vdst(0) va_vcc(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(vcc), s1)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(256), v1)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| //! p_unit_test 9 |
| //! v1: %0:v[0] = v_mov_b32 %0:s[4] |
| //! v1: %0:v[2] = v_mov_b32 %0:vcc_lo |
| //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[1] |
| //! s1: %0:s[4] = s_mov_b32 0 |
| //! s_waitcnt_depctr va_vdst(0) va_vcc(0) sa_sdst(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand(PhysReg(vcc), s1)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(257), v1)); |
| bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| //! p_unit_test 10 |
| //! v1: %0:v[1] = v_mov_b32 %0:s[5] |
| //! v1: %0:v[2] = v_mov_b32 %0:vcc_lo |
| //! s1: %0:s[5] = v_readfirstlane_b32 %0:v[0] |
| //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[1] |
| //! s_waitcnt_depctr va_vdst(0) va_sdst(0) va_vcc(0) |
| //! s_setpc_b64 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(5), s1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand(PhysReg(vcc), s1)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(5), s1), Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(257), v1)); |
| bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); |
| |
| finish_insert_nops_test(true); |
| END_TEST |
| |
| BEGIN_TEST(insert_nops.fpatomic_to_denorm_mode) |
| for (amd_gfx_level lvl : {GFX10, GFX10_3}) { |
| if (!setup_cs(NULL, lvl)) |
| continue; |
| |
| //>> p_unit_test 0 |
| //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0] |
| //! s_nop imm:2 |
| //! s_denorm_mode imm:42 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); |
| bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1), |
| Operand(PhysReg(256), v1)); |
| bld.sopp(aco_opcode::s_denorm_mode, 42); |
| |
| //! p_unit_test 1 |
| //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0] |
| //! s_nop |
| //! s_nop imm:1 |
| //! s_denorm_mode imm:42 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1), |
| Operand(PhysReg(256), v1)); |
| bld.sopp(aco_opcode::s_nop, 0); |
| bld.sopp(aco_opcode::s_denorm_mode, 42); |
| |
| // VALU, waitcnt or enough wait states mitigates the hazard |
| //! p_unit_test 2 |
| //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0] |
| //! v_nop |
| //! s_denorm_mode imm:42 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1), |
| Operand(PhysReg(256), v1)); |
| bld.vop1(aco_opcode::v_nop); |
| bld.sopp(aco_opcode::s_denorm_mode, 42); |
| |
| //! p_unit_test 3 |
| //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0] |
| //! s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0) |
| //! s_denorm_mode imm:42 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1), |
| Operand(PhysReg(256), v1)); |
| bld.sopp(aco_opcode::s_waitcnt, 0); |
| bld.sopp(aco_opcode::s_denorm_mode, 42); |
| |
| //! p_unit_test 4 |
| //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0] |
| //! s_nop imm:2 |
| //! s_denorm_mode imm:42 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1), |
| Operand(PhysReg(256), v1)); |
| bld.sopp(aco_opcode::s_nop, 2); |
| bld.sopp(aco_opcode::s_denorm_mode, 42); |
| |
| //! p_unit_test 5 |
| //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0] |
| //! s_nop |
| //! s_nop |
| //! s_nop |
| //! s_denorm_mode imm:42 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1), |
| Operand(PhysReg(256), v1)); |
| bld.sopp(aco_opcode::s_nop, 0); |
| bld.sopp(aco_opcode::s_nop, 0); |
| bld.sopp(aco_opcode::s_nop, 0); |
| bld.sopp(aco_opcode::s_denorm_mode, 42); |
| |
| finish_insert_nops_test(); |
| } |
| END_TEST |