| /* |
| * Copyright 2024 Valve Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "helpers.h" |
| |
| using namespace aco; |
| |
| BEGIN_TEST(vopd_sched.commutative) |
| if (!setup_cs(NULL, GFX11, CHIP_UNKNOWN, "", 32)) |
| return; |
| /* VOPD scheduling test, GFX11 only. Each case emits a p_unit_test marker followed by two VOP2 |
| * instructions that both read v2 and v3. The expectation lines show either one combined line |
| * with two v_dual_* halves (possibly with the sources of one half swapped) or the two original |
| * instructions left separate. PhysReg 256..259 are printed as v[0]..v[3] in the expectations. |
| * NOTE(review): the expectation comments (the lines tagged with >> or !) look like patterns |
| * consumed by the test harness rather than ordinary comments - keep them in sync with the |
| * builder calls and confirm against the test framework before editing them. */ |
| PhysReg reg_v0{256}; |
| PhysReg reg_v1{257}; |
| PhysReg reg_v2{258}; |
| PhysReg reg_v3{259}; |
| |
| //>> p_unit_test 0 |
| //! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[2] :: v1: %0:v[0] = v_dual_mul_f32 %0:v[2], %0:v[3] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); |
| bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| |
| /* Neither of these opcodes is commutative, so their sources cannot be swapped; the expectation |
| * below keeps the two instructions separate. */ |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 1 |
| //! v1: %0:v[0] = v_fmamk_f32 %0:v[2], %0:v[3], 0 |
| //! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1), Operand::zero()); |
| bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| |
| /* We have to change the opcode for subtractions: with the sources swapped, v_sub_f32 is |
| * expected as v_dual_subrev_f32 (case 2) and v_subrev_f32 as v_dual_sub_f32 (case 3). */ |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 2 |
| //! v1: %0:v[1] = v_dual_subrev_f32 %0:v[3], %0:v[2] :: v1: %0:v[0] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1), Operand::zero()); |
| bld.vop2(aco_opcode::v_sub_f32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 3 |
| //! v1: %0:v[0] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[1] = v_dual_sub_f32 %0:v[3], %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.vop2(aco_opcode::v_subrev_f32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1), Operand::zero()); |
| |
| /* If we have to move the second instruction into OPY instead of OPX, then swapping must still be |
| * correct. Case 5 emits the same pair as case 4 in the opposite program order and expects the |
| * same output. */ |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 4 |
| //! v1: %0:v[0] = v_dual_mul_f32 %0:v[3], %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 5 |
| //! v1: %0:v[0] = v_dual_mul_f32 %0:v[3], %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| |
| finish_schedule_vopd_test(); |
| END_TEST |
| |
| BEGIN_TEST(vopd_sched.mov_to_add_bfrev) |
| for (amd_gfx_level gfx : {GFX11, GFX12}) { |
| if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32)) |
| continue; |
| /* Runs once for GFX11 and once for GFX12. The cases pair a v_mov_b32 (or, in case 7, a |
| * v_bfrev_b32 of a constant) with another instruction. In the gfx11 expectations of cases 0, 5 |
| * and 6 the v_mov_b32 appears as a v_dual_add_nc_u32 with a zero first source. |
| * NOTE(review): expectation lines tagged ~gfx11 / ~gfx12 are presumably matched only for that |
| * level and untagged ones for both - confirm against the test framework. */ |
| PhysReg reg_v0{256}; |
| PhysReg reg_v1{257}; |
| PhysReg reg_v2{258}; |
| PhysReg reg_v3{259}; |
| |
| //>> p_unit_test 0 |
| //~gfx11! v1: %0:v[1] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2] |
| //~gfx12! v1: %0:v[1] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[0] = v_dual_mov_b32 %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v1, v1), Operand(reg_v2, v1)); |
| |
| /* We can't turn the v_mov_b32 into a v_add_u32 because then both instructions would be |
| * OPY-only. |
| * This matches the gfx11 expectations of cases 1 and 2, which leave the pair uncombined; the |
| * gfx12 expectations combine it. Case 2 is the same pair in the opposite program order. |
| */ |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 1 |
| //~gfx11! v1: %0:v[0] = v_mov_b32 %0:v[2] |
| //~gfx11! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3] |
| //~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1)); |
| bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 2 |
| //~gfx11! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3] |
| //~gfx11! v1: %0:v[0] = v_mov_b32 %0:v[2] |
| //~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1)); |
| |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 3 |
| //~gfx11! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2] |
| //~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[2], %0:v[3] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1)); |
| bld.vop2(aco_opcode::v_and_b32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 4 |
| //~gfx11! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2] |
| //~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[2], %0:v[3] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.vop2(aco_opcode::v_and_b32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1)); |
| |
| /* The v_add_u32 should be OPY, not OPX. In the gfx11 expectations of cases 5 and 6 it is the |
| * half printed after the double colon, regardless of program order. */ |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 5 |
| //~gfx11! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2] |
| //~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1), Operand::zero()); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1)); |
| |
| bld.reset(program->create_and_insert_block()); |
| //>> p_unit_test 6 |
| //~gfx11! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2] |
| //~gfx12! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_mov_b32 %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1)); |
| bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1), Operand::zero()); |
| /* Case 7: the v_bfrev_b32 of the constant 60 is expected as a v_dual_mov_b32 of 0x3c000000, |
| * which is 60 with its 32 bits reversed. */ |
| //>> p_unit_test 7 |
| //! v1: %0:v[1] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[0] = v_dual_mov_b32 0x3c000000 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.vop1(aco_opcode::v_bfrev_b32, Definition(reg_v0, v1), Operand::c32(60)); |
| bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v1, v1), Operand(reg_v2, v1)); |
| |
| finish_schedule_vopd_test(); |
| } |
| END_TEST |
| |
| BEGIN_TEST(vopd_sched.war) |
| for (amd_gfx_level gfx : {GFX11, GFX12}) { |
| if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32)) |
| continue; |
| /* Write-after-read cases, run once for GFX11 and once for GFX12: in each pair the first |
| * instruction reads v1 and the second instruction overwrites v1. */ |
| PhysReg reg_v0{256}; |
| PhysReg reg_v1{257}; |
| PhysReg reg_v3{259}; |
| |
| //>> p_unit_test 0 |
| //~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3] |
| //~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3] :: v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); |
| bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v1, v1), |
| Operand(reg_v3, v1)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v3, v1), |
| Operand(reg_v1, v1)); |
| |
| /* We can't use OPX for the v_mul_f32 because of the WaR, but we also can't use OPX for the |
| * v_add_u32 because that opcode is OPY-only. |
| * NOTE(review): the gfx12 expectation below leaves the pair uncombined, as this comment |
| * implies, but the gfx11 expectation combines it with v_mul_f32 in the first half - confirm |
| * that the gfx11 pattern is intended. */ |
| //>> p_unit_test 1 |
| //~gfx11! v1: %0:v[1] = v_dual_mul_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_add_nc_u32 %0:v[1], %0:v[3] |
| //~gfx12! v1: %0:v[0] = v_add_u32 %0:v[1], %0:v[3] |
| //~gfx12! v1: %0:v[1] = v_mul_f32 %0:v[3], %0:v[1] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vop2(aco_opcode::v_add_u32, Definition(reg_v0, v1), Operand(reg_v1, v1), |
| Operand(reg_v3, v1)); |
| bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(reg_v3, v1), |
| Operand(reg_v1, v1)); |
| |
| finish_schedule_vopd_test(); |
| } |
| END_TEST |
| |
| BEGIN_TEST(vopd_sched.same_vgpr) |
| for (amd_gfx_level gfx : {GFX11, GFX12}) { |
| if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32)) |
| continue; |
| /* Pairs in which both instructions read the same VGPR (v2 is a source of every instruction |
| * in this test), run once for GFX11 and once for GFX12. All cases are emitted into the same |
| * block; the expectations differ per level where tagged ~gfx11 / ~gfx12. */ |
| PhysReg reg_v0{256}; |
| PhysReg reg_v1{257}; |
| PhysReg reg_v2{258}; |
| PhysReg reg_v3{259}; |
| |
| //>> p_unit_test 0 |
| //~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[1], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[0] |
| //~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v0, v1)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v1, v1)); |
| |
| //>> p_unit_test 1 |
| //~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_add_f32 %0:v[0], %0:v[2] |
| //~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[1], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[0], %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v0, v1), |
| Operand(reg_v2, v1)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v1, v1), |
| Operand(reg_v2, v1)); |
| |
| //>> p_unit_test 2 |
| //~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[3] |
| //~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[3] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[3] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v3, v1)); |
| |
| //>> p_unit_test 3 |
| //~gfx11! v1: %0:v[0] = v_add_f32 %0:v[2], %0:v[2] |
| //~gfx11! v1: %0:v[1] = v_add_f32 %0:v[2], %0:v[2] |
| //~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v2, v1)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v2, v1)); |
| |
| /* The sources can't be swapped because src0 is an SGPR. The gfx11 expectation leaves the pair |
| * uncombined; the gfx12 expectation combines it without swapping. */ |
| //>> p_unit_test 4 |
| //~gfx11! v1: %0:v[1] = v_mul_f32 %0:s[1], %0:v[2] |
| //~gfx11! v1: %0:v[0] = v_mul_f32 %0:s[2], %0:v[2] |
| //~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:s[2], %0:v[2] :: v1: %0:v[1] = v_dual_mul_f32 %0:s[1], %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); |
| bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(PhysReg(1), s1), |
| Operand(reg_v2, v1)); |
| bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(PhysReg(2), s1), |
| Operand(reg_v2, v1)); |
| |
| /* fmamk uses src2 for the second source, which doesn't allow the same VGPR. The pair is |
| * expected to stay uncombined on both levels. */ |
| //>> p_unit_test 5 |
| //! v1: %0:v[0] = v_fmamk_f32 %0:v[0], %0:v[2], 0x80 |
| //! v1: %0:v[1] = v_fmamk_f32 %0:v[1], %0:v[2], 0x80 |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); |
| bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v0, v1), |
| Operand(reg_v2, v1), Operand::literal32(128)); |
| bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v1, v1), |
| Operand(reg_v2, v1), Operand::literal32(128)); |
| |
| /* The two sources have to be the same size: v_add_f32 paired with v_dot2c_f32_f16 is expected |
| * to stay uncombined on both levels (case 6), while two v_dot2c_f32_f16 are combined in the |
| * gfx12 expectation only (case 7). */ |
| //>> p_unit_test 6 |
| //! v1: %0:v[0] = v_add_f32 %0:v[2], %0:v[2] |
| //! v1: %0:v[1] = v_dot2c_f32_f16 %0:v[2], %0:v[2] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); |
| bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v2, v1)); |
| bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v2, v1)); |
| |
| //>> p_unit_test 7 |
| //~gfx11! v1: %0:v[0] = v_dot2c_f32_f16 %0:v[2], %0:v[2], %0:v[0] |
| //~gfx11! v1: %0:v[1] = v_dot2c_f32_f16 %0:v[2], %0:v[2], %0:v[1] |
| //~gfx12! v1: %0:v[1] = v_dual_dot2acc_f32_f16 %0:v[2], %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_dot2acc_f32_f16 %0:v[2], %0:v[2], %0:v[0] |
| bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); |
| bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v0, v1), Operand(reg_v2, v1), |
| Operand(reg_v2, v1), Operand(reg_v0, v1)); |
| bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v1, v1), Operand(reg_v2, v1), |
| Operand(reg_v2, v1), Operand(reg_v1, v1)); |
| |
| finish_schedule_vopd_test(); |
| } |
| END_TEST |