| /* |
| * Copyright © 2018 Valve Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| */ |
| |
| #ifndef ACO_IR_H |
| #define ACO_IR_H |
| |
| #include "aco_opcodes.h" |
| #include "aco_util.h" |
| #include "aco_interface.h" |
| #include "aco_shader_info.h" |
| #include "vulkan/radv_shader.h" |
| |
| #include "nir.h" |
| |
| #include <algorithm> |
| #include <bitset> |
| #include <memory> |
| #include <vector> |
| |
| struct radv_shader_args; |
| |
| namespace aco { |
| |
| extern uint64_t debug_flags; |
| |
| enum { |
| DEBUG_VALIDATE_IR = 0x1, |
| DEBUG_VALIDATE_RA = 0x2, |
| DEBUG_PERFWARN = 0x4, |
| DEBUG_FORCE_WAITCNT = 0x8, |
| DEBUG_NO_VN = 0x10, |
| DEBUG_NO_OPT = 0x20, |
| DEBUG_NO_SCHED = 0x40, |
| DEBUG_PERF_INFO = 0x80, |
| DEBUG_LIVE_INFO = 0x100, |
| DEBUG_FORCE_WAITDEPS = 0x200, |
| DEBUG_NO_VALIDATE_IR = 0x400, |
| }; |
| |
| /** |
 * Representation of the instruction's microcode encoding format.
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding.
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitives.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
| * |
| * (*) The same is applicable for VOP1 and VOPC instructions. |
| */ |
| enum class Format : std::uint16_t { |
| /* Pseudo Instruction Format */ |
| PSEUDO = 0, |
| /* Scalar ALU & Control Formats */ |
| SOP1 = 1, |
| SOP2 = 2, |
| SOPK = 3, |
| SOPP = 4, |
| SOPC = 5, |
| /* Scalar Memory Format */ |
| SMEM = 6, |
| /* LDS/GDS Format */ |
| DS = 8, |
| LDSDIR = 9, |
| /* Vector Memory Buffer Formats */ |
| MTBUF = 10, |
| MUBUF = 11, |
| /* Vector Memory Image Format */ |
| MIMG = 12, |
| /* Export Format */ |
| EXP = 13, |
| /* Flat Formats */ |
| FLAT = 14, |
| GLOBAL = 15, |
| SCRATCH = 16, |
| |
| PSEUDO_BRANCH = 17, |
| PSEUDO_BARRIER = 18, |
| PSEUDO_REDUCTION = 19, |
| |
| /* Vector ALU Formats */ |
| VOP3P = 20, |
| VINTERP_INREG = 21, |
| VOP1 = 1 << 8, |
| VOP2 = 1 << 9, |
| VOPC = 1 << 10, |
| VOP3 = 1 << 11, |
| /* Vector Parameter Interpolation Format */ |
| VINTRP = 1 << 12, |
| DPP16 = 1 << 13, |
| SDWA = 1 << 14, |
| DPP8 = 1 << 15, |
| }; |
| |
| enum class instr_class : uint8_t { |
| valu32 = 0, |
| valu_convert32 = 1, |
| valu64 = 2, |
| valu_quarter_rate32 = 3, |
| valu_fma = 4, |
| valu_transcendental32 = 5, |
| valu_double = 6, |
| valu_double_add = 7, |
| valu_double_convert = 8, |
| valu_double_transcendental = 9, |
| salu = 10, |
| smem = 11, |
| barrier = 12, |
| branch = 13, |
| sendmsg = 14, |
| ds = 15, |
| exp = 16, |
| vmem = 17, |
| waitcnt = 18, |
| other = 19, |
| count, |
| }; |
| |
| enum storage_class : uint8_t { |
| storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */ |
| storage_buffer = 0x1, /* SSBOs and global memory */ |
| storage_gds = 0x2, |
| storage_image = 0x4, |
| storage_shared = 0x8, /* or TCS output */ |
| storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */ |
   storage_task_payload = 0x20, /* Task-Mesh payload */
| storage_scratch = 0x40, |
| storage_vgpr_spill = 0x80, |
| storage_count = 8, /* not counting storage_none */ |
| }; |
| |
| enum memory_semantics : uint8_t { |
| semantic_none = 0x0, |
| /* for loads: don't move any access after this load to before this load (even other loads) |
| * for barriers: don't move any access after the barrier to before any |
| * atomics/control_barriers/sendmsg_gs_done before the barrier */ |
| semantic_acquire = 0x1, |
| /* for stores: don't move any access before this store to after this store |
| * for barriers: don't move any access before the barrier to after any |
| * atomics/control_barriers/sendmsg_gs_done after the barrier */ |
| semantic_release = 0x2, |
| |
| /* the rest are for load/stores/atomics only */ |
| /* cannot be DCE'd or CSE'd */ |
| semantic_volatile = 0x4, |
| /* does not interact with barriers and assumes this lane is the only lane |
| * accessing this memory */ |
| semantic_private = 0x8, |
| /* this operation can be reordered around operations of the same storage. |
| * says nothing about barriers */ |
| semantic_can_reorder = 0x10, |
   /* this is an atomic instruction (it may only read or write memory) */
| semantic_atomic = 0x20, |
   /* this instruction both reads and writes memory */
| semantic_rmw = 0x40, |
| |
| semantic_acqrel = semantic_acquire | semantic_release, |
| semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw, |
| }; |
| |
| enum sync_scope : uint8_t { |
| scope_invocation = 0, |
| scope_subgroup = 1, |
| scope_workgroup = 2, |
| scope_queuefamily = 3, |
| scope_device = 4, |
| }; |
| |
| struct memory_sync_info { |
| memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {} |
| memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation) |
| : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_) |
| {} |
| |
| storage_class storage : 8; |
| memory_semantics semantics : 8; |
| sync_scope scope : 8; |
| |
| bool operator==(const memory_sync_info& rhs) const |
| { |
| return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope; |
| } |
| |
| bool can_reorder() const |
| { |
| if (semantics & semantic_acqrel) |
| return false; |
| /* Also check storage so that zero-initialized memory_sync_info can be |
| * reordered. */ |
| return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile); |
| } |
| }; |
| static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding"); |
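
/* Illustrative sketch: storage classes are bit flags (the constructor takes
 * plain ints so flags can be OR'd together). A buffer/image barrier with
 * acquire-release semantics cannot be reordered, while a zero-initialized
 * memory_sync_info can. */
static inline void
example_memory_sync_info()
{
   memory_sync_info barrier_sync(storage_buffer | storage_image, semantic_acqrel,
                                 scope_workgroup);
   assert(!barrier_sync.can_reorder());
   assert(memory_sync_info().can_reorder());
}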
| |
| enum fp_round { |
| fp_round_ne = 0, |
| fp_round_pi = 1, |
| fp_round_ni = 2, |
| fp_round_tz = 3, |
| }; |
| |
| enum fp_denorm { |
| /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and |
| * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */ |
| fp_denorm_flush = 0x0, |
| fp_denorm_keep_in = 0x1, |
| fp_denorm_keep_out = 0x2, |
| fp_denorm_keep = 0x3, |
| }; |
| |
| struct float_mode { |
| /* matches encoding of the MODE register */ |
| union { |
| struct { |
| fp_round round32 : 2; |
| fp_round round16_64 : 2; |
| unsigned denorm32 : 2; |
| unsigned denorm16_64 : 2; |
| }; |
| struct { |
| uint8_t round : 4; |
| uint8_t denorm : 4; |
| }; |
| uint8_t val = 0; |
| }; |
| /* if false, optimizations which may remove infs/nan/-0.0 can be done */ |
| bool preserve_signed_zero_inf_nan32 : 1; |
| bool preserve_signed_zero_inf_nan16_64 : 1; |
| /* if false, optimizations which may remove denormal flushing can be done */ |
| bool must_flush_denorms32 : 1; |
| bool must_flush_denorms16_64 : 1; |
| bool care_about_round32 : 1; |
| bool care_about_round16_64 : 1; |
| |
| /* Returns true if instructions using the mode "other" can safely use the |
| * current one instead. */ |
| bool canReplace(float_mode other) const noexcept |
| { |
| return val == other.val && |
| (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) && |
| (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) && |
| (must_flush_denorms32 || !other.must_flush_denorms32) && |
| (must_flush_denorms16_64 || !other.must_flush_denorms16_64) && |
| (care_about_round32 || !other.care_about_round32) && |
| (care_about_round16_64 || !other.care_about_round16_64); |
| } |
| }; |
| |
| struct wait_imm { |
| static const uint8_t unset_counter = 0xff; |
| |
| uint8_t vm; |
| uint8_t exp; |
| uint8_t lgkm; |
| uint8_t vs; |
| |
| wait_imm(); |
| wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_); |
| wait_imm(enum amd_gfx_level chip, uint16_t packed); |
| |
| uint16_t pack(enum amd_gfx_level chip) const; |
| |
| bool combine(const wait_imm& other); |
| |
| bool empty() const; |
| }; |
| |
| constexpr Format |
| asVOP3(Format format) |
| { |
| return (Format)((uint32_t)Format::VOP3 | (uint32_t)format); |
| }; |
| |
| constexpr Format |
| asSDWA(Format format) |
| { |
| assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC); |
| return (Format)((uint32_t)Format::SDWA | (uint32_t)format); |
| } |
| |
| constexpr Format |
| withoutDPP(Format format) |
| { |
| return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8)); |
| } |
| |
| enum class RegType { |
| none = 0, |
| sgpr, |
| vgpr, |
| linear_vgpr, |
| }; |
| |
| struct RegClass { |
| |
| enum RC : uint8_t { |
| s1 = 1, |
| s2 = 2, |
| s3 = 3, |
| s4 = 4, |
| s6 = 6, |
| s8 = 8, |
| s16 = 16, |
| v1 = s1 | (1 << 5), |
| v2 = s2 | (1 << 5), |
| v3 = s3 | (1 << 5), |
| v4 = s4 | (1 << 5), |
| v5 = 5 | (1 << 5), |
| v6 = 6 | (1 << 5), |
| v7 = 7 | (1 << 5), |
| v8 = 8 | (1 << 5), |
| /* byte-sized register class */ |
| v1b = v1 | (1 << 7), |
| v2b = v2 | (1 << 7), |
| v3b = v3 | (1 << 7), |
| v4b = v4 | (1 << 7), |
| v6b = v6 | (1 << 7), |
| v8b = v8 | (1 << 7), |
| /* these are used for WWM and spills to vgpr */ |
| v1_linear = v1 | (1 << 6), |
| v2_linear = v2 | (1 << 6), |
| }; |
| |
| RegClass() = default; |
| constexpr RegClass(RC rc_) : rc(rc_) {} |
| constexpr RegClass(RegType type, unsigned size) |
| : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size)) |
| {} |
| |
| constexpr operator RC() const { return rc; } |
| explicit operator bool() = delete; |
| |
| constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; } |
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); }
| constexpr bool is_subdword() const { return rc & (1 << 7); } |
| constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); } |
| // TODO: use size() less in favor of bytes() |
| constexpr unsigned size() const { return (bytes() + 3) >> 2; } |
| constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); } |
| constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); } |
| constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); } |
| |
| static constexpr RegClass get(RegType type, unsigned bytes) |
| { |
| if (type == RegType::sgpr) { |
| return RegClass(type, DIV_ROUND_UP(bytes, 4u)); |
| } else { |
| return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u); |
| } |
| } |
| |
| constexpr RegClass resize(unsigned bytes) const |
| { |
| if (is_linear_vgpr()) { |
| assert(bytes % 4u == 0); |
| return get(RegType::vgpr, bytes).as_linear(); |
| } |
| return get(type(), bytes); |
| } |
| |
| private: |
| RC rc; |
| }; |
| |
| /* transitional helper expressions */ |
| static constexpr RegClass s1{RegClass::s1}; |
| static constexpr RegClass s2{RegClass::s2}; |
| static constexpr RegClass s3{RegClass::s3}; |
| static constexpr RegClass s4{RegClass::s4}; |
| static constexpr RegClass s8{RegClass::s8}; |
| static constexpr RegClass s16{RegClass::s16}; |
| static constexpr RegClass v1{RegClass::v1}; |
| static constexpr RegClass v2{RegClass::v2}; |
| static constexpr RegClass v3{RegClass::v3}; |
| static constexpr RegClass v4{RegClass::v4}; |
| static constexpr RegClass v5{RegClass::v5}; |
| static constexpr RegClass v6{RegClass::v6}; |
| static constexpr RegClass v7{RegClass::v7}; |
| static constexpr RegClass v8{RegClass::v8}; |
| static constexpr RegClass v1b{RegClass::v1b}; |
| static constexpr RegClass v2b{RegClass::v2b}; |
| static constexpr RegClass v3b{RegClass::v3b}; |
| static constexpr RegClass v4b{RegClass::v4b}; |
| static constexpr RegClass v6b{RegClass::v6b}; |
| static constexpr RegClass v8b{RegClass::v8b}; |
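
/* Illustrative sketches of the encoding above: dword classes count 32-bit
 * registers, sub-dword classes count bytes, and get() picks between them. */
static_assert(v2.bytes() == 8 && v2.size() == 2, "dword classes count registers");
static_assert(v3b.bytes() == 3 && v3b.size() == 1, "sub-dword classes count bytes");
static_assert(RegClass::get(RegType::vgpr, 6) == v6b, "non-dword-sized VGPR class");
static_assert(RegClass::get(RegType::sgpr, 6) == s2, "SGPR classes round up to dwords");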
| |
| /** |
| * Temp Class |
| * Each temporary virtual register has a |
| * register class (i.e. size and type) |
 * and an SSA id.
| */ |
| struct Temp { |
| Temp() noexcept : id_(0), reg_class(0) {} |
| constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {} |
| |
| constexpr uint32_t id() const noexcept { return id_; } |
| constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; } |
| |
| constexpr unsigned bytes() const noexcept { return regClass().bytes(); } |
| constexpr unsigned size() const noexcept { return regClass().size(); } |
| constexpr RegType type() const noexcept { return regClass().type(); } |
| constexpr bool is_linear() const noexcept { return regClass().is_linear(); } |
| |
| constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); } |
| constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); } |
| constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); } |
| |
| private: |
| uint32_t id_ : 24; |
| uint32_t reg_class : 8; |
| }; |
| |
| /** |
| * PhysReg |
| * Represents the physical register for each |
| * Operand and Definition. |
| */ |
| struct PhysReg { |
| constexpr PhysReg() = default; |
| explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {} |
| constexpr unsigned reg() const { return reg_b >> 2; } |
| constexpr unsigned byte() const { return reg_b & 0x3; } |
| constexpr operator unsigned() const { return reg(); } |
| constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; } |
| constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; } |
| constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; } |
| constexpr PhysReg advance(int bytes) const |
| { |
| PhysReg res = *this; |
| res.reg_b += bytes; |
| return res; |
| } |
| |
| uint16_t reg_b = 0; |
| }; |
| |
| /* helper expressions for special registers */ |
| static constexpr PhysReg m0{124}; |
| static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */ |
| static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */ |
| static constexpr PhysReg vcc{106}; |
| static constexpr PhysReg vcc_hi{107}; |
| static constexpr PhysReg tba{108}; /* GFX6-GFX8 */ |
| static constexpr PhysReg tma{110}; /* GFX6-GFX8 */ |
| static constexpr PhysReg ttmp0{112}; |
| static constexpr PhysReg ttmp1{113}; |
| static constexpr PhysReg ttmp2{114}; |
| static constexpr PhysReg ttmp3{115}; |
| static constexpr PhysReg ttmp4{116}; |
| static constexpr PhysReg ttmp5{117}; |
| static constexpr PhysReg ttmp6{118}; |
| static constexpr PhysReg ttmp7{119}; |
| static constexpr PhysReg ttmp8{120}; |
| static constexpr PhysReg ttmp9{121}; |
| static constexpr PhysReg ttmp10{122}; |
| static constexpr PhysReg ttmp11{123}; |
| static constexpr PhysReg sgpr_null{125}; /* GFX10+ */ |
| static constexpr PhysReg exec{126}; |
| static constexpr PhysReg exec_lo{126}; |
| static constexpr PhysReg exec_hi{127}; |
| static constexpr PhysReg vccz{251}; |
| static constexpr PhysReg execz{252}; |
| static constexpr PhysReg scc{253}; |
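
/* Illustrative sketches: PhysReg counts bytes internally (reg_b), so
 * advancing by two bytes stays within the same dword register. VGPRs start
 * at register index 256. */
static_assert(PhysReg{256}.advance(2).reg() == 256, "still the first VGPR");
static_assert(PhysReg{256}.advance(2).byte() == 2, "byte offset within the register");
static_assert(exec == exec_lo && exec_hi.reg() == 127, "exec is an SGPR pair");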
| |
| /** |
| * Operand Class |
| * Initially, each Operand refers to either |
| * a temporary virtual register |
 * or to a constant value.
 * Temporary registers get mapped to physical registers during RA.
| * Constant values are inlined into the instruction sequence. |
| */ |
| class Operand final { |
| public: |
| constexpr Operand() |
| : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false), |
| isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false), |
| is24bit_(false), signext(false) |
| {} |
| |
| explicit Operand(Temp r) noexcept |
| { |
| data_.temp = r; |
| if (r.id()) { |
| isTemp_ = true; |
| } else { |
| isUndef_ = true; |
| setFixed(PhysReg{128}); |
| } |
| }; |
| explicit Operand(Temp r, PhysReg reg) noexcept |
| { |
| assert(r.id()); /* Don't allow fixing an undef to a register */ |
| data_.temp = r; |
| isTemp_ = true; |
| setFixed(reg); |
| }; |
| |
| /* 8-bit constant */ |
| static Operand c8(uint8_t v) noexcept |
| { |
      /* 8-bit constants are only used for copies, and a copy from any 8-bit
       * constant can be implemented with an SDWA v_mul_u32_u24. So consider
       * them all to be inline constants. */
| Operand op; |
| op.control_ = 0; |
| op.data_.i = v; |
| op.isConstant_ = true; |
| op.constSize = 0; |
| op.setFixed(PhysReg{0u}); |
| return op; |
| }; |
| |
| /* 16-bit constant */ |
| static Operand c16(uint16_t v) noexcept |
| { |
| Operand op; |
| op.control_ = 0; |
| op.data_.i = v; |
| op.isConstant_ = true; |
| op.constSize = 1; |
| if (v <= 64) |
| op.setFixed(PhysReg{128u + v}); |
| else if (v >= 0xFFF0) /* [-16 .. -1] */ |
| op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)}); |
| else if (v == 0x3800) /* 0.5 */ |
| op.setFixed(PhysReg{240}); |
| else if (v == 0xB800) /* -0.5 */ |
| op.setFixed(PhysReg{241}); |
| else if (v == 0x3C00) /* 1.0 */ |
| op.setFixed(PhysReg{242}); |
| else if (v == 0xBC00) /* -1.0 */ |
| op.setFixed(PhysReg{243}); |
| else if (v == 0x4000) /* 2.0 */ |
| op.setFixed(PhysReg{244}); |
| else if (v == 0xC000) /* -2.0 */ |
| op.setFixed(PhysReg{245}); |
| else if (v == 0x4400) /* 4.0 */ |
| op.setFixed(PhysReg{246}); |
| else if (v == 0xC400) /* -4.0 */ |
| op.setFixed(PhysReg{247}); |
| else if (v == 0x3118) /* 1/2 PI */ |
| op.setFixed(PhysReg{248}); |
| else /* Literal Constant */ |
| op.setFixed(PhysReg{255}); |
| return op; |
| } |
| |
| /* 32-bit constant */ |
| static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); } |
| |
| /* 64-bit constant */ |
| static Operand c64(uint64_t v) noexcept |
| { |
| Operand op; |
| op.control_ = 0; |
| op.isConstant_ = true; |
| op.constSize = 3; |
| if (v <= 64) { |
| op.data_.i = (uint32_t)v; |
| op.setFixed(PhysReg{128 + (uint32_t)v}); |
| } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */ |
| op.data_.i = (uint32_t)v; |
| op.setFixed(PhysReg{192 - (uint32_t)v}); |
| } else if (v == 0x3FE0000000000000) { /* 0.5 */ |
| op.data_.i = 0x3f000000; |
| op.setFixed(PhysReg{240}); |
| } else if (v == 0xBFE0000000000000) { /* -0.5 */ |
| op.data_.i = 0xbf000000; |
| op.setFixed(PhysReg{241}); |
| } else if (v == 0x3FF0000000000000) { /* 1.0 */ |
| op.data_.i = 0x3f800000; |
| op.setFixed(PhysReg{242}); |
| } else if (v == 0xBFF0000000000000) { /* -1.0 */ |
| op.data_.i = 0xbf800000; |
| op.setFixed(PhysReg{243}); |
| } else if (v == 0x4000000000000000) { /* 2.0 */ |
| op.data_.i = 0x40000000; |
| op.setFixed(PhysReg{244}); |
| } else if (v == 0xC000000000000000) { /* -2.0 */ |
| op.data_.i = 0xc0000000; |
| op.setFixed(PhysReg{245}); |
| } else if (v == 0x4010000000000000) { /* 4.0 */ |
| op.data_.i = 0x40800000; |
| op.setFixed(PhysReg{246}); |
| } else if (v == 0xC010000000000000) { /* -4.0 */ |
| op.data_.i = 0xc0800000; |
| op.setFixed(PhysReg{247}); |
      } else { /* Literal Constant: we don't know if it is a long or double. */
| op.signext = v >> 63; |
| op.data_.i = v & 0xffffffffu; |
| op.setFixed(PhysReg{255}); |
| assert(op.constantValue64() == v && |
| "attempt to create a unrepresentable 64-bit literal constant"); |
| } |
| return op; |
| } |
| |
| /* 32-bit constant stored as a 32-bit or 64-bit operand */ |
| static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept |
| { |
| Operand op; |
| op.control_ = 0; |
| op.data_.i = v; |
| op.isConstant_ = true; |
| op.constSize = is64bit ? 3 : 2; |
| if (v <= 64) |
| op.setFixed(PhysReg{128 + v}); |
| else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */ |
| op.setFixed(PhysReg{192 - v}); |
| else if (v == 0x3f000000) /* 0.5 */ |
| op.setFixed(PhysReg{240}); |
| else if (v == 0xbf000000) /* -0.5 */ |
| op.setFixed(PhysReg{241}); |
| else if (v == 0x3f800000) /* 1.0 */ |
| op.setFixed(PhysReg{242}); |
| else if (v == 0xbf800000) /* -1.0 */ |
| op.setFixed(PhysReg{243}); |
| else if (v == 0x40000000) /* 2.0 */ |
| op.setFixed(PhysReg{244}); |
| else if (v == 0xc0000000) /* -2.0 */ |
| op.setFixed(PhysReg{245}); |
| else if (v == 0x40800000) /* 4.0 */ |
| op.setFixed(PhysReg{246}); |
| else if (v == 0xc0800000) /* -4.0 */ |
| op.setFixed(PhysReg{247}); |
| else { /* Literal Constant */ |
| assert(!is64bit && "attempt to create a 64-bit literal constant"); |
| op.setFixed(PhysReg{255}); |
| } |
| return op; |
| } |
| |
| static Operand literal32(uint32_t v) noexcept |
| { |
| Operand op; |
| op.control_ = 0; |
| op.data_.i = v; |
| op.isConstant_ = true; |
| op.constSize = 2; |
| op.setFixed(PhysReg{255}); |
| return op; |
| } |
| |
| explicit Operand(RegClass type) noexcept |
| { |
| isUndef_ = true; |
| data_.temp = Temp(0, type); |
| setFixed(PhysReg{128}); |
| }; |
| explicit Operand(PhysReg reg, RegClass type) noexcept |
| { |
| data_.temp = Temp(0, type); |
| setFixed(reg); |
| } |
| |
| static Operand zero(unsigned bytes = 4) noexcept |
| { |
| if (bytes == 8) |
| return Operand::c64(0); |
| else if (bytes == 4) |
| return Operand::c32(0); |
| else if (bytes == 2) |
| return Operand::c16(0); |
| assert(bytes == 1); |
| return Operand::c8(0); |
| } |
| |
   /* Prefer this over the constructors when the operand size is not known
    * statically, or when 1/2 PI might become an inline constant (which
    * depends on the gfx level).
    */
| static Operand get_const(enum amd_gfx_level chip, uint64_t val, unsigned bytes) |
| { |
| if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) { |
| /* 1/2 PI can be an inline constant on GFX8+ */ |
| Operand op = Operand::c32(val); |
| op.setFixed(PhysReg{248}); |
| return op; |
| } |
| |
| if (bytes == 8) |
| return Operand::c64(val); |
| else if (bytes == 4) |
| return Operand::c32(val); |
| else if (bytes == 2) |
| return Operand::c16(val); |
| assert(bytes == 1); |
| return Operand::c8(val); |
| } |
| |
| static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false, |
| bool sext = false) |
| { |
| if (bytes <= 4) |
| return true; |
| |
| if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000) |
| return true; |
| uint64_t upper33 = val & 0xFFFFFFFF80000000; |
| if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0)) |
| return true; |
| |
| return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */ |
| val == 0x3FE0000000000000 || /* 0.5 */ |
| val == 0xBFE0000000000000 || /* -0.5 */ |
| val == 0x3FF0000000000000 || /* 1.0 */ |
| val == 0xBFF0000000000000 || /* -1.0 */ |
| val == 0x4000000000000000 || /* 2.0 */ |
| val == 0xC000000000000000 || /* -2.0 */ |
| val == 0x4010000000000000 || /* 4.0 */ |
| val == 0xC010000000000000; /* -4.0 */ |
| } |
| |
| constexpr bool isTemp() const noexcept { return isTemp_; } |
| |
| constexpr void setTemp(Temp t) noexcept |
| { |
| assert(!isConstant_); |
| isTemp_ = true; |
| data_.temp = t; |
| } |
| |
| constexpr Temp getTemp() const noexcept { return data_.temp; } |
| |
| constexpr uint32_t tempId() const noexcept { return data_.temp.id(); } |
| |
| constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); } |
| |
| constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); } |
| |
| constexpr unsigned bytes() const noexcept |
| { |
| if (isConstant()) |
| return 1 << constSize; |
| else |
| return data_.temp.bytes(); |
| } |
| |
| constexpr unsigned size() const noexcept |
| { |
| if (isConstant()) |
| return constSize > 2 ? 2 : 1; |
| else |
| return data_.temp.size(); |
| } |
| |
| constexpr bool isFixed() const noexcept { return isFixed_; } |
| |
| constexpr PhysReg physReg() const noexcept { return reg_; } |
| |
| constexpr void setFixed(PhysReg reg) noexcept |
| { |
| isFixed_ = reg != unsigned(-1); |
| reg_ = reg; |
| } |
| |
| constexpr bool isConstant() const noexcept { return isConstant_; } |
| |
| constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; } |
| |
| constexpr bool isUndefined() const noexcept { return isUndef_; } |
| |
| constexpr uint32_t constantValue() const noexcept { return data_.i; } |
| |
| constexpr bool constantEquals(uint32_t cmp) const noexcept |
| { |
| return isConstant() && constantValue() == cmp; |
| } |
| |
| constexpr uint64_t constantValue64() const noexcept |
| { |
| if (constSize == 3) { |
| if (reg_ <= 192) |
| return reg_ - 128; |
| else if (reg_ <= 208) |
| return 0xFFFFFFFFFFFFFFFF - (reg_ - 193); |
| |
| switch (reg_) { |
| case 240: return 0x3FE0000000000000; |
| case 241: return 0xBFE0000000000000; |
| case 242: return 0x3FF0000000000000; |
| case 243: return 0xBFF0000000000000; |
| case 244: return 0x4000000000000000; |
| case 245: return 0xC000000000000000; |
| case 246: return 0x4010000000000000; |
| case 247: return 0xC010000000000000; |
| case 255: |
| return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i; |
| } |
| unreachable("invalid register for 64-bit constant"); |
| } else { |
| return data_.i; |
| } |
| } |
| |
| /* Value if this were used with vop3/opsel or vop3p. */ |
| constexpr uint16_t constantValue16(bool opsel) const noexcept |
| { |
| assert(bytes() == 2 || bytes() == 4); |
| if (opsel) { |
| if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral()) |
| return int16_t(data_.i) >> 16; /* 16-bit inline integers are sign-extended, even with fp16 instrs */ |
| else |
| return data_.i >> 16; |
| } |
| return data_.i; |
| } |
| |
| constexpr bool isOfType(RegType type) const noexcept |
| { |
| return hasRegClass() && regClass().type() == type; |
| } |
| |
| /* Indicates that the killed operand's live range intersects with the |
| * instruction's definitions. Unlike isKill() and isFirstKill(), this is |
| * not set by liveness analysis. */ |
| constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; } |
| |
| constexpr bool isLateKill() const noexcept { return isLateKill_; } |
| |
| constexpr void setKill(bool flag) noexcept |
| { |
| isKill_ = flag; |
| if (!flag) |
| setFirstKill(false); |
| } |
| |
| constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); } |
| |
| constexpr void setFirstKill(bool flag) noexcept |
| { |
| isFirstKill_ = flag; |
| if (flag) |
| setKill(flag); |
| } |
| |
| /* When there are multiple operands killing the same temporary, |
    * isFirstKill() only returns true for the first one. */
| constexpr bool isFirstKill() const noexcept { return isFirstKill_; } |
| |
| constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); } |
| |
| constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); } |
| |
| constexpr bool operator==(Operand other) const noexcept |
| { |
| if (other.size() != size()) |
| return false; |
| if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef()) |
| return false; |
| if (isFixed() && other.isFixed() && physReg() != other.physReg()) |
| return false; |
| if (isLiteral()) |
| return other.isLiteral() && other.constantValue() == constantValue(); |
| else if (isConstant()) |
| return other.isConstant() && other.physReg() == physReg(); |
| else if (isUndefined()) |
| return other.isUndefined() && other.regClass() == regClass(); |
| else |
| return other.isTemp() && other.getTemp() == getTemp(); |
| } |
| |
| constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); } |
| |
| constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; } |
| |
| constexpr bool is16bit() const noexcept { return is16bit_; } |
| |
| constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; } |
| |
| constexpr bool is24bit() const noexcept { return is24bit_; } |
| |
| private: |
| union { |
| Temp temp; |
| uint32_t i; |
| float f; |
| } data_ = {Temp(0, s1)}; |
| PhysReg reg_; |
| union { |
| struct { |
| uint8_t isTemp_ : 1; |
| uint8_t isFixed_ : 1; |
| uint8_t isConstant_ : 1; |
| uint8_t isKill_ : 1; |
| uint8_t isUndef_ : 1; |
| uint8_t isFirstKill_ : 1; |
| uint8_t constSize : 2; |
| uint8_t isLateKill_ : 1; |
| uint8_t is16bit_ : 1; |
| uint8_t is24bit_ : 1; |
| uint8_t signext : 1; |
| }; |
| /* can't initialize bit-fields in c++11, so work around using a union */ |
| uint16_t control_ = 0; |
| }; |
| }; |
| |
| /** |
| * Definition Class |
| * Definitions are the results of Instructions |
| * and refer to temporary virtual registers |
| * which are later mapped to physical registers |
| */ |
| class Definition final { |
| public: |
| constexpr Definition() |
| : temp(Temp(0, s1)), reg_(0), isFixed_(0), isKill_(0), isPrecise_(0), isNUW_(0), isNoCSE_(0) |
| {} |
| Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {} |
| explicit Definition(Temp tmp) noexcept : temp(tmp) {} |
| Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); } |
| Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type)) |
| { |
| setFixed(reg); |
| } |
| |
| constexpr bool isTemp() const noexcept { return tempId() > 0; } |
| |
| constexpr Temp getTemp() const noexcept { return temp; } |
| |
| constexpr uint32_t tempId() const noexcept { return temp.id(); } |
| |
| constexpr void setTemp(Temp t) noexcept { temp = t; } |
| |
| void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); } |
| |
| constexpr RegClass regClass() const noexcept { return temp.regClass(); } |
| |
| constexpr unsigned bytes() const noexcept { return temp.bytes(); } |
| |
| constexpr unsigned size() const noexcept { return temp.size(); } |
| |
| constexpr bool isFixed() const noexcept { return isFixed_; } |
| |
| constexpr PhysReg physReg() const noexcept { return reg_; } |
| |
| constexpr void setFixed(PhysReg reg) noexcept |
| { |
| isFixed_ = 1; |
| reg_ = reg; |
| } |
| |
| constexpr void setKill(bool flag) noexcept { isKill_ = flag; } |
| |
| constexpr bool isKill() const noexcept { return isKill_; } |
| |
| constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; } |
| |
| constexpr bool isPrecise() const noexcept { return isPrecise_; } |
| |
| /* No Unsigned Wrap */ |
| constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; } |
| |
| constexpr bool isNUW() const noexcept { return isNUW_; } |
| |
| constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; } |
| |
| constexpr bool isNoCSE() const noexcept { return isNoCSE_; } |
| |
| private: |
| Temp temp = Temp(0, s1); |
| PhysReg reg_; |
| union { |
| struct { |
| uint8_t isFixed_ : 1; |
| uint8_t isKill_ : 1; |
| uint8_t isPrecise_ : 1; |
| uint8_t isNUW_ : 1; |
| uint8_t isNoCSE_ : 1; |
| }; |
| /* can't initialize bit-fields in c++11, so work around using a union */ |
| uint8_t control_ = 0; |
| }; |
| }; |
| |
| struct Block; |
| struct Instruction; |
| struct Pseudo_instruction; |
| struct SOP1_instruction; |
| struct SOP2_instruction; |
| struct SOPK_instruction; |
| struct SOPP_instruction; |
| struct SOPC_instruction; |
| struct SMEM_instruction; |
| struct DS_instruction; |
| struct LDSDIR_instruction; |
| struct MTBUF_instruction; |
| struct MUBUF_instruction; |
| struct MIMG_instruction; |
| struct Export_instruction; |
| struct FLAT_instruction; |
| struct Pseudo_branch_instruction; |
| struct Pseudo_barrier_instruction; |
| struct Pseudo_reduction_instruction; |
| struct VOP3P_instruction; |
| struct VINTERP_inreg_instruction; |
| struct VOP1_instruction; |
| struct VOP2_instruction; |
| struct VOPC_instruction; |
| struct VOP3_instruction; |
| struct VINTRP_instruction; |
| struct DPP16_instruction; |
| struct DPP8_instruction; |
| struct SDWA_instruction; |
| |
| struct Instruction { |
| aco_opcode opcode; |
| Format format; |
| uint32_t pass_flags; |
| |
| aco::span<Operand> operands; |
| aco::span<Definition> definitions; |
| |
| constexpr bool usesModifiers() const noexcept; |
| |
| constexpr bool reads_exec() const noexcept |
| { |
| for (const Operand& op : operands) { |
| if (op.isFixed() && op.physReg() == exec) |
| return true; |
| } |
| return false; |
| } |
| |
| Pseudo_instruction& pseudo() noexcept |
| { |
| assert(isPseudo()); |
| return *(Pseudo_instruction*)this; |
| } |
| const Pseudo_instruction& pseudo() const noexcept |
| { |
| assert(isPseudo()); |
| return *(Pseudo_instruction*)this; |
| } |
| constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; } |
| SOP1_instruction& sop1() noexcept |
| { |
| assert(isSOP1()); |
| return *(SOP1_instruction*)this; |
| } |
| const SOP1_instruction& sop1() const noexcept |
| { |
| assert(isSOP1()); |
| return *(SOP1_instruction*)this; |
| } |
| constexpr bool isSOP1() const noexcept { return format == Format::SOP1; } |
| SOP2_instruction& sop2() noexcept |
| { |
| assert(isSOP2()); |
| return *(SOP2_instruction*)this; |
| } |
| const SOP2_instruction& sop2() const noexcept |
| { |
| assert(isSOP2()); |
| return *(SOP2_instruction*)this; |
| } |
| constexpr bool isSOP2() const noexcept { return format == Format::SOP2; } |
| SOPK_instruction& sopk() noexcept |
| { |
| assert(isSOPK()); |
| return *(SOPK_instruction*)this; |
| } |
| const SOPK_instruction& sopk() const noexcept |
| { |
| assert(isSOPK()); |
| return *(SOPK_instruction*)this; |
| } |
| constexpr bool isSOPK() const noexcept { return format == Format::SOPK; } |
| SOPP_instruction& sopp() noexcept |
| { |
| assert(isSOPP()); |
| return *(SOPP_instruction*)this; |
| } |
| const SOPP_instruction& sopp() const noexcept |
| { |
| assert(isSOPP()); |
| return *(SOPP_instruction*)this; |
| } |
| constexpr bool isSOPP() const noexcept { return format == Format::SOPP; } |
| SOPC_instruction& sopc() noexcept |
| { |
| assert(isSOPC()); |
| return *(SOPC_instruction*)this; |
| } |
| const SOPC_instruction& sopc() const noexcept |
| { |
| assert(isSOPC()); |
| return *(SOPC_instruction*)this; |
| } |
| constexpr bool isSOPC() const noexcept { return format == Format::SOPC; } |
| SMEM_instruction& smem() noexcept |
| { |
| assert(isSMEM()); |
| return *(SMEM_instruction*)this; |
| } |
| const SMEM_instruction& smem() const noexcept |
| { |
| assert(isSMEM()); |
| return *(SMEM_instruction*)this; |
| } |
| constexpr bool isSMEM() const noexcept { return format == Format::SMEM; } |
| DS_instruction& ds() noexcept |
| { |
| assert(isDS()); |
| return *(DS_instruction*)this; |
| } |
| const DS_instruction& ds() const noexcept |
| { |
| assert(isDS()); |
| return *(DS_instruction*)this; |
| } |
| constexpr bool isDS() const noexcept { return format == Format::DS; } |
| LDSDIR_instruction& ldsdir() noexcept |
| { |
| assert(isLDSDIR()); |
| return *(LDSDIR_instruction*)this; |
| } |
| const LDSDIR_instruction& ldsdir() const noexcept |
| { |
| assert(isLDSDIR()); |
| return *(LDSDIR_instruction*)this; |
| } |
| constexpr bool isLDSDIR() const noexcept { return format == Format::LDSDIR; } |
| MTBUF_instruction& mtbuf() noexcept |
| { |
| assert(isMTBUF()); |
| return *(MTBUF_instruction*)this; |
| } |
| const MTBUF_instruction& mtbuf() const noexcept |
| { |
| assert(isMTBUF()); |
| return *(MTBUF_instruction*)this; |
| } |
| constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; } |
| MUBUF_instruction& mubuf() noexcept |
| { |
| assert(isMUBUF()); |
| return *(MUBUF_instruction*)this; |
| } |
| const MUBUF_instruction& mubuf() const noexcept |
| { |
| assert(isMUBUF()); |
| return *(MUBUF_instruction*)this; |
| } |
| constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; } |
| MIMG_instruction& mimg() noexcept |
| { |
| assert(isMIMG()); |
| return *(MIMG_instruction*)this; |
| } |
| const MIMG_instruction& mimg() const noexcept |
| { |
| assert(isMIMG()); |
| return *(MIMG_instruction*)this; |
| } |
| constexpr bool isMIMG() const noexcept { return format == Format::MIMG; } |
| Export_instruction& exp() noexcept |
| { |
| assert(isEXP()); |
| return *(Export_instruction*)this; |
| } |
| const Export_instruction& exp() const noexcept |
| { |
| assert(isEXP()); |
| return *(Export_instruction*)this; |
| } |
| constexpr bool isEXP() const noexcept { return format == Format::EXP; } |
| FLAT_instruction& flat() noexcept |
| { |
| assert(isFlat()); |
| return *(FLAT_instruction*)this; |
| } |
| const FLAT_instruction& flat() const noexcept |
| { |
| assert(isFlat()); |
| return *(FLAT_instruction*)this; |
| } |
| constexpr bool isFlat() const noexcept { return format == Format::FLAT; } |
| FLAT_instruction& global() noexcept |
| { |
| assert(isGlobal()); |
| return *(FLAT_instruction*)this; |
| } |
| const FLAT_instruction& global() const noexcept |
| { |
| assert(isGlobal()); |
| return *(FLAT_instruction*)this; |
| } |
| constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; } |
| FLAT_instruction& scratch() noexcept |
| { |
| assert(isScratch()); |
| return *(FLAT_instruction*)this; |
| } |
| const FLAT_instruction& scratch() const noexcept |
| { |
| assert(isScratch()); |
| return *(FLAT_instruction*)this; |
| } |
| constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; } |
| Pseudo_branch_instruction& branch() noexcept |
| { |
| assert(isBranch()); |
| return *(Pseudo_branch_instruction*)this; |
| } |
| const Pseudo_branch_instruction& branch() const noexcept |
| { |
| assert(isBranch()); |
| return *(Pseudo_branch_instruction*)this; |
| } |
| constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; } |
| Pseudo_barrier_instruction& barrier() noexcept |
| { |
| assert(isBarrier()); |
| return *(Pseudo_barrier_instruction*)this; |
| } |
| const Pseudo_barrier_instruction& barrier() const noexcept |
| { |
| assert(isBarrier()); |
| return *(Pseudo_barrier_instruction*)this; |
| } |
| constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; } |
| Pseudo_reduction_instruction& reduction() noexcept |
| { |
| assert(isReduction()); |
| return *(Pseudo_reduction_instruction*)this; |
| } |
| const Pseudo_reduction_instruction& reduction() const noexcept |
| { |
| assert(isReduction()); |
| return *(Pseudo_reduction_instruction*)this; |
| } |
| constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; } |
| VOP3P_instruction& vop3p() noexcept |
| { |
| assert(isVOP3P()); |
| return *(VOP3P_instruction*)this; |
| } |
| const VOP3P_instruction& vop3p() const noexcept |
| { |
| assert(isVOP3P()); |
| return *(VOP3P_instruction*)this; |
| } |
| constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; } |
| VINTERP_inreg_instruction& vinterp_inreg() noexcept |
| { |
| assert(isVINTERP_INREG()); |
| return *(VINTERP_inreg_instruction*)this; |
| } |
| const VINTERP_inreg_instruction& vinterp_inreg() const noexcept |
| { |
| assert(isVINTERP_INREG()); |
| return *(VINTERP_inreg_instruction*)this; |
| } |
| constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; } |
| VOP1_instruction& vop1() noexcept |
| { |
| assert(isVOP1()); |
| return *(VOP1_instruction*)this; |
| } |
| const VOP1_instruction& vop1() const noexcept |
| { |
| assert(isVOP1()); |
| return *(VOP1_instruction*)this; |
| } |
| constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; } |
| VOP2_instruction& vop2() noexcept |
| { |
| assert(isVOP2()); |
| return *(VOP2_instruction*)this; |
| } |
| const VOP2_instruction& vop2() const noexcept |
| { |
| assert(isVOP2()); |
| return *(VOP2_instruction*)this; |
| } |
| constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; } |
| VOPC_instruction& vopc() noexcept |
| { |
| assert(isVOPC()); |
| return *(VOPC_instruction*)this; |
| } |
| const VOPC_instruction& vopc() const noexcept |
| { |
| assert(isVOPC()); |
| return *(VOPC_instruction*)this; |
| } |
| constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; } |
| VOP3_instruction& vop3() noexcept |
| { |
| assert(isVOP3()); |
| return *(VOP3_instruction*)this; |
| } |
| const VOP3_instruction& vop3() const noexcept |
| { |
| assert(isVOP3()); |
| return *(VOP3_instruction*)this; |
| } |
| constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; } |
| VINTRP_instruction& vintrp() noexcept |
| { |
| assert(isVINTRP()); |
| return *(VINTRP_instruction*)this; |
| } |
| const VINTRP_instruction& vintrp() const noexcept |
| { |
| assert(isVINTRP()); |
| return *(VINTRP_instruction*)this; |
| } |
| constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; } |
| DPP16_instruction& dpp16() noexcept |
| { |
| assert(isDPP16()); |
| return *(DPP16_instruction*)this; |
| } |
| const DPP16_instruction& dpp16() const noexcept |
| { |
| assert(isDPP16()); |
| return *(DPP16_instruction*)this; |
| } |
| constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; } |
| DPP8_instruction& dpp8() noexcept |
| { |
| assert(isDPP8()); |
| return *(DPP8_instruction*)this; |
| } |
| const DPP8_instruction& dpp8() const noexcept |
| { |
| assert(isDPP8()); |
| return *(DPP8_instruction*)this; |
| } |
| constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; } |
| constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); } |
| SDWA_instruction& sdwa() noexcept |
| { |
| assert(isSDWA()); |
| return *(SDWA_instruction*)this; |
| } |
| const SDWA_instruction& sdwa() const noexcept |
| { |
| assert(isSDWA()); |
| return *(SDWA_instruction*)this; |
| } |
| constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; } |
| |
| FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; } |
| |
| const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; } |
| |
| constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); } |
| |
| constexpr bool isVALU() const noexcept |
| { |
| return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P(); |
| } |
| |
| constexpr bool isSALU() const noexcept |
| { |
| return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP(); |
| } |
| |
| constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); } |
| }; |
| static_assert(sizeof(Instruction) == 16, "Unexpected padding"); |
| |
| struct SOPK_instruction : public Instruction { |
| uint16_t imm; |
| uint16_t padding; |
| }; |
| static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
| struct SOPP_instruction : public Instruction { |
| uint32_t imm; |
| int block; |
| }; |
| static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
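
/* Illustrative sketch of the usual access pattern: test the format first,
 * then use the checked accessor instead of a raw cast. The "block" field is
 * meaningful for lowered branch instructions. */
static inline int
example_sopp_block(const Instruction* instr)
{
   return instr->isSOPP() ? instr->sopp().block : -1;
}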
| |
| struct SOPC_instruction : public Instruction { |
| uint32_t padding; |
| }; |
| static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
| struct SOP1_instruction : public Instruction {}; |
| static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); |
| |
| struct SOP2_instruction : public Instruction { |
| uint32_t padding; |
| }; |
| static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
| /** |
| * Scalar Memory Format: |
| * For s_(buffer_)load_dword*: |
| * Operand(0): SBASE - SGPR-pair which provides base address |
| * Operand(1): Offset - immediate (un)signed offset or SGPR |
| * Operand(2) / Definition(0): SDATA - SGPR for read / write result |
| * Operand(n-1): SOffset - SGPR offset (Vega only) |
| * |
| * Having no operands is also valid for instructions such as s_dcache_inv. |
| * |
| */ |
| struct SMEM_instruction : public Instruction { |
| memory_sync_info sync; |
| bool glc : 1; /* VI+: globally coherent */ |
| bool dlc : 1; /* NAVI: device level coherent */ |
| bool nv : 1; /* VEGA only: Non-volatile */ |
| bool disable_wqm : 1; |
| bool prevent_overflow : 1; /* avoid overflow when combining additions */ |
| uint8_t padding : 3; |
| }; |
| static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
| struct VOP1_instruction : public Instruction {}; |
| static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); |
| |
| struct VOP2_instruction : public Instruction {}; |
| static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); |
| |
| struct VOPC_instruction : public Instruction {}; |
| static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); |
| |
| struct VOP3_instruction : public Instruction { |
| bool abs[3]; |
| bool neg[3]; |
| uint8_t opsel : 4; |
| uint8_t omod : 2; |
| bool clamp : 1; |
| uint8_t padding0 : 1; |
| uint8_t padding1; |
| }; |
| static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| struct VOP3P_instruction : public Instruction { |
| bool neg_lo[3]; |
| bool neg_hi[3]; /* abs modifier, for v_mad_mix/v_fma_mix */ |
| uint8_t opsel_lo : 3; |
| uint8_t opsel_hi : 3; |
| bool clamp : 1; |
| uint8_t padding0 : 1; |
| uint8_t padding1; |
| }; |
| static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| struct VINTERP_inreg_instruction : public Instruction { |
| uint8_t wait_exp : 3; |
| bool clamp : 1; |
| uint8_t opsel : 4; |
| bool neg[3]; |
| }; |
| static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
| /** |
| * Data Parallel Primitives Format: |
| * This format can be used for VOP1, VOP2 or VOPC instructions. |
| * The swizzle applies to the src0 operand. |
| * |
| */ |
| struct DPP16_instruction : public Instruction { |
| bool abs[2]; |
| bool neg[2]; |
| uint16_t dpp_ctrl; |
| uint8_t row_mask : 4; |
| uint8_t bank_mask : 4; |
| bool bound_ctrl : 1; |
| uint8_t padding : 7; |
| }; |
| static_assert(sizeof(DPP16_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| struct DPP8_instruction : public Instruction { |
| uint8_t lane_sel[8]; |
| }; |
| static_assert(sizeof(DPP8_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| struct SubdwordSel { |
| enum sdwa_sel : uint8_t { |
| ubyte = 0x4, |
| uword = 0x8, |
| dword = 0x10, |
| sext = 0x20, |
| sbyte = ubyte | sext, |
| sword = uword | sext, |
| |
| ubyte0 = ubyte, |
| ubyte1 = ubyte | 1, |
| ubyte2 = ubyte | 2, |
| ubyte3 = ubyte | 3, |
| sbyte0 = sbyte, |
| sbyte1 = sbyte | 1, |
| sbyte2 = sbyte | 2, |
| sbyte3 = sbyte | 3, |
| uword0 = uword, |
| uword1 = uword | 2, |
| sword0 = sword, |
| sword1 = sword | 2, |
| }; |
| |
| SubdwordSel() : sel((sdwa_sel)0) {} |
| constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {} |
| constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend) |
| : sel((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset)) |
| {} |
| constexpr operator sdwa_sel() const { return sel; } |
| explicit operator bool() const { return sel != 0; } |
| |
| constexpr unsigned size() const { return (sel >> 2) & 0x7; } |
| constexpr unsigned offset() const { return sel & 0x3; } |
| constexpr bool sign_extend() const { return sel & sext; } |
| constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const |
| { |
| reg_byte_offset += offset(); |
| if (size() == 1) |
| return reg_byte_offset; |
| else if (size() == 2) |
| return 4 + (reg_byte_offset >> 1); |
| else |
| return 6; |
| } |
| |
| private: |
| sdwa_sel sel; |
| }; |
| |
| /** |
| * Sub-Dword Addressing Format: |
| * This format can be used for VOP1, VOP2 or VOPC instructions. |
| * |
| * omod and SGPR/constant operands are only available on GFX9+. For VOPC, |
| * the definition doesn't have to be VCC on GFX9+. |
| * |
| */ |
| struct SDWA_instruction : public Instruction { |
| /* these destination modifiers aren't available with VOPC except for |
| * clamp on GFX8 */ |
| SubdwordSel sel[2]; |
| SubdwordSel dst_sel; |
| bool neg[2]; |
| bool abs[2]; |
| bool clamp : 1; |
| uint8_t omod : 2; /* GFX9+ */ |
| uint8_t padding : 5; |
| }; |
| static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| struct VINTRP_instruction : public Instruction { |
| uint8_t attribute; |
| uint8_t component; |
| uint16_t padding; |
| }; |
| static_assert(sizeof(VINTRP_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
| /** |
| * Local and Global Data Sharing instructions |
| * Operand(0): ADDR - VGPR which supplies the address. |
| * Operand(1): DATA0 - First data VGPR. |
| * Operand(2): DATA1 - Second data VGPR. |
| * Operand(n-1): M0 - LDS size. |
| * Definition(0): VDST - Destination VGPR when results returned to VGPRs. |
| * |
| */ |
| struct DS_instruction : public Instruction { |
| memory_sync_info sync; |
| bool gds; |
| uint16_t offset0; |
| uint8_t offset1; |
| uint8_t padding; |
| }; |
| static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| /** |
| * LDS Direct instructions |
| * Operand(0): M0 |
| * Definition(0): VDST - Destination VGPR |
| */ |
| struct LDSDIR_instruction : public Instruction { |
| memory_sync_info sync; |
| uint8_t attr : 6; |
| uint8_t attr_chan : 2; |
| uint32_t wait_vdst : 4; |
| uint32_t padding : 28; |
| }; |
| static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| /** |
| * Vector Memory Untyped-buffer Instructions |
| * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) |
| * Operand(1): VADDR - Address source. Can carry an index and/or offset |
| * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) |
| * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data |
| * |
| */ |
| struct MUBUF_instruction : public Instruction { |
| memory_sync_info sync; |
| bool offen : 1; /* Supply an offset from VGPR (VADDR) */ |
| bool idxen : 1; /* Supply an index from VGPR (VADDR) */ |
| bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ |
| bool glc : 1; /* globally coherent */ |
| bool dlc : 1; /* NAVI: device level coherent */ |
| bool slc : 1; /* system level coherent */ |
| bool tfe : 1; /* texture fail enable */ |
| bool lds : 1; /* Return read-data to LDS instead of VGPRs */ |
| uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */ |
| uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ |
| uint16_t swizzled : 1; |
| uint16_t padding0 : 2; |
| uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */ |
| uint16_t padding1 : 10; |
| }; |
| static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| /** |
| * Vector Memory Typed-buffer Instructions |
| * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) |
| * Operand(1): VADDR - Address source. Can carry an index and/or offset |
| * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) |
| * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data |
| * |
| */ |
| struct MTBUF_instruction : public Instruction { |
| memory_sync_info sync; |
| uint8_t dfmt : 4; /* Data Format of data in memory buffer */ |
| uint8_t nfmt : 3; /* Numeric format of data in memory */ |
| bool offen : 1; /* Supply an offset from VGPR (VADDR) */ |
| uint16_t idxen : 1; /* Supply an index from VGPR (VADDR) */ |
| uint16_t glc : 1; /* globally coherent */ |
| uint16_t dlc : 1; /* NAVI: device level coherent */ |
| uint16_t slc : 1; /* system level coherent */ |
| uint16_t tfe : 1; /* texture fail enable */ |
| uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */ |
| uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */ |
| uint16_t padding : 4; |
| uint16_t offset; /* Unsigned byte offset - 12 bit */ |
| }; |
| static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| /** |
| * Vector Memory Image Instructions |
 * Operand(0): SRSRC - Scalar GPR that specifies the resource constant.
| * Operand(1): SSAMP - Scalar GPR that specifies sampler constant. |
| * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1. |
| * Operand(3): VADDR - Address source. Can carry an offset or an index. |
| * Definition(0): VDATA - Vector GPR for read result. |
| * |
| */ |
| struct MIMG_instruction : public Instruction { |
| memory_sync_info sync; |
| uint8_t dmask; /* Data VGPR enable mask */ |
| uint8_t dim : 3; /* NAVI: dimensionality */ |
| bool unrm : 1; /* Force address to be un-normalized */ |
| bool dlc : 1; /* NAVI: device level coherent */ |
| bool glc : 1; /* globally coherent */ |
| bool slc : 1; /* system level coherent */ |
| bool tfe : 1; /* texture fail enable */ |
| bool da : 1; /* declare an array */ |
| bool lwe : 1; /* LOD warning enable */ |
| bool r128 : 1; /* NAVI: Texture resource size */ |
| bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ |
| bool d16 : 1; /* Convert 32-bit data to 16-bit data */ |
| bool disable_wqm : 1; /* Require an exec mask without helper invocations */ |
| uint8_t padding0 : 2; |
| uint8_t padding1; |
| uint8_t padding2; |
| }; |
| static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| /** |
| * Flat/Scratch/Global Instructions |
| * Operand(0): ADDR |
| * Operand(1): SADDR |
| * Operand(2) / Definition(0): DATA/VDST |
| * |
| */ |
| struct FLAT_instruction : public Instruction { |
| memory_sync_info sync; |
| bool slc : 1; /* system level coherent */ |
| bool glc : 1; /* globally coherent */ |
| bool dlc : 1; /* NAVI: device level coherent */ |
| bool lds : 1; |
| bool nv : 1; |
| bool disable_wqm : 1; /* Require an exec mask without helper invocations */ |
| uint8_t padding0 : 2; |
| int16_t offset; /* Vega/Navi only */ |
| uint16_t padding1; |
| }; |
| static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| struct Export_instruction : public Instruction { |
| uint8_t enabled_mask; |
| uint8_t dest; |
| bool compressed : 1; |
| bool done : 1; |
| bool valid_mask : 1; |
| bool row_en : 1; |
| uint8_t padding0 : 4; |
| uint8_t padding1; |
| }; |
| static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
| struct Pseudo_instruction : public Instruction { |
| PhysReg scratch_sgpr; /* might not be valid if it's not needed */ |
| bool tmp_in_scc; |
| uint8_t padding; |
| }; |
| static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
| struct Pseudo_branch_instruction : public Instruction { |
| /* target[0] is the block index of the branch target. |
| * For conditional branches, target[1] contains the fall-through alternative. |
| * A value of 0 means the target has not been initialized (BB0 cannot be a branch target). |
| */ |
| uint32_t target[2]; |
| nir_selection_control selection_control; |
| }; |
| static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 12, "Unexpected padding"); |
| |
| struct Pseudo_barrier_instruction : public Instruction { |
| memory_sync_info sync; |
| sync_scope exec_scope; |
| }; |
| static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
| enum ReduceOp : uint16_t { |
| // clang-format off |
| iadd8, iadd16, iadd32, iadd64, |
| imul8, imul16, imul32, imul64, |
| fadd16, fadd32, fadd64, |
| fmul16, fmul32, fmul64, |
| imin8, imin16, imin32, imin64, |
| imax8, imax16, imax32, imax64, |
| umin8, umin16, umin32, umin64, |
| umax8, umax16, umax32, umax64, |
| fmin16, fmin32, fmin64, |
| fmax16, fmax32, fmax64, |
| iand8, iand16, iand32, iand64, |
| ior8, ior16, ior32, ior64, |
| ixor8, ixor16, ixor32, ixor64, |
| num_reduce_ops, |
| // clang-format on |
| }; |
| |
| /** |
 * Subgroup Reduction Instructions: every operand and definition other than
 * the data to be reduced and the result is inserted by setup_reduce_temp().
| * Operand(0): data to be reduced |
| * Operand(1): reduce temporary |
| * Operand(2): vector temporary |
| * Definition(0): result |
| * Definition(1): scalar temporary |
| * Definition(2): scalar identity temporary (not used to store identity on GFX10) |
| * Definition(3): scc clobber |
| * Definition(4): vcc clobber |
| * |
| */ |
| struct Pseudo_reduction_instruction : public Instruction { |
| ReduceOp reduce_op; |
| uint16_t cluster_size; // must be 0 for scans |
| }; |
| static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, |
| "Unexpected padding"); |
| |
| extern thread_local aco::monotonic_buffer_resource* instruction_buffer; |
| |
| struct instr_deleter_functor { |
/* Don't free any instructions here; they are all deallocated
 * at once after compilation has finished.
| */ |
void operator()(void*) {}
| }; |
| |
| template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>; |
| |
| template <typename T> |
| T* |
| create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, |
| uint32_t num_definitions) |
| { |
| std::size_t size = |
| sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition); |
| void* data = instruction_buffer->allocate(size, alignof(uint32_t)); |
| memset(data, 0, size); |
| T* inst = (T*)data; |
| |
| inst->opcode = opcode; |
| inst->format = format; |
| |
| uint16_t operands_offset = sizeof(T) - offsetof(Instruction, operands); |
| inst->operands = aco::span<Operand>(operands_offset, num_operands); |
| uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions; |
| inst->definitions = aco::span<Definition>(definitions_offset, num_definitions); |
| |
| return inst; |
| } |
| |
| constexpr bool |
| Instruction::usesModifiers() const noexcept |
| { |
| if (isDPP() || isSDWA()) |
| return true; |
| |
| if (isVOP3P()) { |
| const VOP3P_instruction& vop3p = this->vop3p(); |
| for (unsigned i = 0; i < operands.size(); i++) { |
| if (vop3p.neg_lo[i] || vop3p.neg_hi[i]) |
| return true; |
| |
| /* opsel_hi must be 1 to not be considered a modifier - even for constants */ |
| if (!(vop3p.opsel_hi & (1 << i))) |
| return true; |
| } |
| return vop3p.opsel_lo || vop3p.clamp; |
| } else if (isVOP3()) { |
| const VOP3_instruction& vop3 = this->vop3(); |
| for (unsigned i = 0; i < operands.size(); i++) { |
| if (vop3.abs[i] || vop3.neg[i]) |
| return true; |
| } |
| return vop3.opsel || vop3.clamp || vop3.omod; |
| } |
| return false; |
| } |
| |
| constexpr bool |
| is_phi(Instruction* instr) |
| { |
| return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi; |
| } |
| |
| static inline bool |
| is_phi(aco_ptr<Instruction>& instr) |
| { |
| return is_phi(instr.get()); |
| } |
| |
| memory_sync_info get_sync_info(const Instruction* instr); |
| |
| inline bool |
| is_dead(const std::vector<uint16_t>& uses, const Instruction* instr) |
| { |
| if (instr->definitions.empty() || instr->isBranch() || |
| instr->opcode == aco_opcode::p_init_scratch || |
| instr->opcode == aco_opcode::p_dual_src_export_gfx11) |
| return false; |
| |
| if (std::any_of(instr->definitions.begin(), instr->definitions.end(), |
| [&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; })) |
| return false; |
| |
| return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel)); |
| } |
| |
| bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx); |
| bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op); |
| bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra); |
| bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8); |
| /* updates "instr" and returns the old instruction (or NULL if no update was needed) */ |
| aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr); |
| aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8); |
| bool needs_exec_mask(const Instruction* instr); |
| |
| aco_opcode get_ordered(aco_opcode op); |
| aco_opcode get_unordered(aco_opcode op); |
| aco_opcode get_inverse(aco_opcode op); |
| aco_opcode get_f32_cmp(aco_opcode op); |
| aco_opcode get_vcmpx(aco_opcode op); |
| unsigned get_cmp_bitsize(aco_opcode op); |
| bool is_fp_cmp(aco_opcode op); |
| bool is_cmpx(aco_opcode op); |
| |
| bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op); |
| |
| uint32_t get_reduction_identity(ReduceOp op, unsigned idx); |
| |
| unsigned get_mimg_nsa_dwords(const Instruction* instr); |
| |
| bool should_form_clause(const Instruction* a, const Instruction* b); |
| |
| enum block_kind { |
/* uniform indicates that when leaving this block,
 * all active lanes stay active */
| block_kind_uniform = 1 << 0, |
| block_kind_top_level = 1 << 1, |
| block_kind_loop_preheader = 1 << 2, |
| block_kind_loop_header = 1 << 3, |
| block_kind_loop_exit = 1 << 4, |
| block_kind_continue = 1 << 5, |
| block_kind_break = 1 << 6, |
| block_kind_continue_or_break = 1 << 7, |
| block_kind_branch = 1 << 8, |
| block_kind_merge = 1 << 9, |
| block_kind_invert = 1 << 10, |
| block_kind_discard_early_exit = 1 << 11, |
| block_kind_uses_discard = 1 << 12, |
| block_kind_needs_lowering = 1 << 13, |
| block_kind_export_end = 1 << 15, |
| }; |
| |
| struct RegisterDemand { |
| constexpr RegisterDemand() = default; |
| constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {} |
| int16_t vgpr = 0; |
| int16_t sgpr = 0; |
| |
| constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept |
| { |
| return a.vgpr == b.vgpr && a.sgpr == b.sgpr; |
| } |
| |
| constexpr bool exceeds(const RegisterDemand other) const noexcept |
| { |
| return vgpr > other.vgpr || sgpr > other.sgpr; |
| } |
| |
| constexpr RegisterDemand operator+(const Temp t) const noexcept |
| { |
| if (t.type() == RegType::sgpr) |
| return RegisterDemand(vgpr, sgpr + t.size()); |
| else |
| return RegisterDemand(vgpr + t.size(), sgpr); |
| } |
| |
| constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept |
| { |
| return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr); |
| } |
| |
| constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept |
| { |
| return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr); |
| } |
| |
| constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept |
| { |
| vgpr += other.vgpr; |
| sgpr += other.sgpr; |
| return *this; |
| } |
| |
| constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept |
| { |
| vgpr -= other.vgpr; |
| sgpr -= other.sgpr; |
| return *this; |
| } |
| |
| constexpr RegisterDemand& operator+=(const Temp t) noexcept |
| { |
| if (t.type() == RegType::sgpr) |
| sgpr += t.size(); |
| else |
| vgpr += t.size(); |
| return *this; |
| } |
| |
| constexpr RegisterDemand& operator-=(const Temp t) noexcept |
| { |
| if (t.type() == RegType::sgpr) |
| sgpr -= t.size(); |
| else |
| vgpr -= t.size(); |
| return *this; |
| } |
| |
| constexpr void update(const RegisterDemand other) noexcept |
| { |
| vgpr = std::max(vgpr, other.vgpr); |
| sgpr = std::max(sgpr, other.sgpr); |
| } |
| }; |
| |
| /* CFG */ |
| struct Block { |
| float_mode fp_mode; |
| unsigned index; |
| unsigned offset = 0; |
| std::vector<aco_ptr<Instruction>> instructions; |
| std::vector<unsigned> logical_preds; |
| std::vector<unsigned> linear_preds; |
| std::vector<unsigned> logical_succs; |
| std::vector<unsigned> linear_succs; |
| RegisterDemand register_demand = RegisterDemand(); |
| uint16_t loop_nest_depth = 0; |
| uint16_t divergent_if_logical_depth = 0; |
| uint16_t uniform_if_depth = 0; |
| uint16_t kind = 0; |
| int logical_idom = -1; |
| int linear_idom = -1; |
| |
/* this information is needed for the predecessors of blocks with phis when
 * moving out of SSA */
| bool scc_live_out = false; |
| |
| Block() : index(0) {} |
| }; |
| |
| /* |
| * Shader stages as provided in Vulkan by the application. Contrast this to HWStage. |
| */ |
| enum class SWStage : uint16_t { |
| None = 0, |
| VS = 1 << 0, /* Vertex Shader */ |
| GS = 1 << 1, /* Geometry Shader */ |
| TCS = 1 << 2, /* Tessellation Control aka Hull Shader */ |
| TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */ |
| FS = 1 << 4, /* Fragment aka Pixel Shader */ |
| CS = 1 << 5, /* Compute Shader */ |
| TS = 1 << 6, /* Task Shader */ |
| MS = 1 << 7, /* Mesh Shader */ |
| GSCopy = 1 << 8, /* GS Copy Shader (internal) */ |
| |
| /* Stage combinations merged to run on a single HWStage */ |
| VS_GS = VS | GS, |
| VS_TCS = VS | TCS, |
| TES_GS = TES | GS, |
| }; |
| |
| constexpr SWStage |
| operator|(SWStage a, SWStage b) |
| { |
| return static_cast<SWStage>(static_cast<uint16_t>(a) | static_cast<uint16_t>(b)); |
| } |
| |
| /* |
| * Shader stages as running on the AMD GPU. |
| * |
| * The relation between HWStages and SWStages is not a one-to-one mapping: |
| * Some SWStages are merged by ACO to run on a single HWStage. |
| * See README.md for details. |
| */ |
| enum class HWStage : uint8_t { |
| VS, |
| ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */ |
| GS, /* Geometry shader on GFX10/legacy and GFX6-9. */ |
| NGG, /* Primitive shader, used to implement VS, TES, GS. */ |
| LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */ |
| HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */ |
| FS, |
| CS, |
| }; |
| |
| /* |
| * Set of SWStages to be merged into a single shader paired with the |
| * HWStage it will run on. |
| */ |
| struct Stage { |
| constexpr Stage() = default; |
| |
| explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {} |
| |
| /* Check if the given SWStage is included */ |
| constexpr bool has(SWStage stage) const |
| { |
| return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage)); |
| } |
| |
| unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); } |
| |
| constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; } |
| |
| constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; } |
| |
| /* Mask of merged software stages */ |
| SWStage sw = SWStage::None; |
| |
| /* Active hardware stage */ |
| HWStage hw{}; |
| }; |
| |
| /* possible settings of Program::stage */ |
| static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS); |
| static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS); |
| static constexpr Stage compute_cs(HWStage::CS, SWStage::CS); |
| static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES); |
| static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy); |
| /* Mesh shading pipeline */ |
| static constexpr Stage task_cs(HWStage::CS, SWStage::TS); |
| static constexpr Stage mesh_ngg(HWStage::NGG, SWStage::MS); |
| /* GFX10/NGG */ |
| static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS); |
| static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS); |
| static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES); |
| static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS); |
| /* GFX9 (and GFX10 if NGG isn't used) */ |
| static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS); |
| static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS); |
| static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS); |
| /* pre-GFX9 */ |
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
| static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */ |
| static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS); |
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
| static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS); |
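
/* e.g. vertex_geometry_gs.has(SWStage::VS) and
 * vertex_geometry_gs.has(SWStage::GS) both hold, while the merged program
 * runs on a single hardware stage (HWStage::GS) */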
| |
| enum statistic { |
| statistic_hash, |
| statistic_instructions, |
| statistic_copies, |
| statistic_branches, |
| statistic_latency, |
| statistic_inv_throughput, |
| statistic_vmem_clauses, |
| statistic_smem_clauses, |
| statistic_sgpr_presched, |
| statistic_vgpr_presched, |
| num_statistics |
| }; |
| |
| struct DeviceInfo { |
| uint16_t lds_encoding_granule; |
| uint16_t lds_alloc_granule; |
| uint32_t lds_limit; /* in bytes */ |
| bool has_16bank_lds; |
| uint16_t physical_sgprs; |
| uint16_t physical_vgprs; |
| uint16_t vgpr_limit; |
| uint16_t sgpr_limit; |
| uint16_t sgpr_alloc_granule; |
| uint16_t vgpr_alloc_granule; |
| unsigned max_wave64_per_simd; |
| unsigned simd_per_cu; |
| bool has_fast_fma32 = false; |
| bool has_mac_legacy32 = false; |
| bool fused_mad_mix = false; |
| bool xnack_enabled = false; |
| bool sram_ecc_enabled = false; |
| |
| int16_t scratch_global_offset_min; |
| int16_t scratch_global_offset_max; |
| }; |
| |
| enum class CompilationProgress { |
| after_isel, |
| after_spilling, |
| after_ra, |
| }; |
| |
| class Program final { |
| public: |
| aco::monotonic_buffer_resource m{65536}; |
| std::vector<Block> blocks; |
| std::vector<RegClass> temp_rc = {s1}; |
| RegisterDemand max_reg_demand = RegisterDemand(); |
| ac_shader_config* config; |
| struct aco_shader_info info; |
| enum amd_gfx_level gfx_level; |
| enum radeon_family family; |
| DeviceInfo dev; |
| unsigned wave_size; |
| RegClass lane_mask; |
| Stage stage; |
| bool needs_exact = false; /* there exists an instruction with disable_wqm = true */ |
| bool needs_wqm = false; /* there exists a p_wqm instruction */ |
| bool has_color_exports = false; |
| |
| std::vector<uint8_t> constant_data; |
| Temp private_segment_buffer; |
| Temp scratch_offset; |
| |
| uint16_t num_waves = 0; |
| uint16_t min_waves = 0; |
| unsigned workgroup_size; /* if known; otherwise UINT_MAX */ |
| bool wgp_mode; |
bool early_rast = false; /* whether rasterization can start as soon as the first DONE pos export */
| |
| bool needs_vcc = false; |
| |
| CompilationProgress progress; |
| |
| bool collect_statistics = false; |
| uint32_t statistics[num_statistics]; |
| |
| float_mode next_fp_mode; |
| unsigned next_loop_depth = 0; |
| unsigned next_divergent_if_logical_depth = 0; |
| unsigned next_uniform_if_depth = 0; |
| |
| std::vector<Definition> vs_inputs; |
| |
| struct { |
| FILE* output = stderr; |
| bool shorten_messages = false; |
| void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message); |
| void* private_data; |
| } debug; |
| |
| uint32_t allocateId(RegClass rc) |
| { |
| assert(allocationID <= 16777215); |
| temp_rc.push_back(rc); |
| return allocationID++; |
| } |
| |
| void allocateRange(unsigned amount) |
| { |
| assert(allocationID + amount <= 16777216); |
| temp_rc.resize(temp_rc.size() + amount); |
| allocationID += amount; |
| } |
| |
| Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); } |
| |
| uint32_t peekAllocationId() { return allocationID; } |
| |
| friend void reindex_ssa(Program* program); |
| friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out); |
| |
| Block* create_and_insert_block() |
| { |
| Block block; |
| return insert_block(std::move(block)); |
| } |
| |
| Block* insert_block(Block&& block) |
| { |
| block.index = blocks.size(); |
| block.fp_mode = next_fp_mode; |
| block.loop_nest_depth = next_loop_depth; |
| block.divergent_if_logical_depth = next_divergent_if_logical_depth; |
| block.uniform_if_depth = next_uniform_if_depth; |
| blocks.emplace_back(std::move(block)); |
| return &blocks.back(); |
| } |
| |
| private: |
| uint32_t allocationID = 1; |
| }; |
| |
| struct live { |
| /* live temps out per block */ |
| std::vector<IDSet> live_out; |
| /* register demand (sgpr/vgpr) per instruction per block */ |
| std::vector<std::vector<RegisterDemand>> register_demand; |
| }; |
| |
| struct ra_test_policy { |
| /* Force RA to always use its pessimistic fallback algorithm */ |
| bool skip_optimistic_path = false; |
| }; |
| |
| void init(); |
| |
| void init_program(Program* program, Stage stage, const struct aco_shader_info* info, |
| enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode, |
| ac_shader_config* config); |
| |
| void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, |
| ac_shader_config* config, const struct aco_compiler_options* options, |
| const struct aco_shader_info* info, |
| const struct radv_shader_args* args); |
| void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config, |
| const struct aco_compiler_options* options, |
| const struct aco_shader_info* info, |
| const struct radv_shader_args* args); |
| void select_trap_handler_shader(Program* program, struct nir_shader* shader, |
| ac_shader_config* config, |
| const struct aco_compiler_options* options, |
| const struct aco_shader_info* info, |
| const struct radv_shader_args* args); |
| void select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, |
| ac_shader_config* config, |
| const struct aco_compiler_options* options, |
| const struct aco_shader_info* info, |
| const struct radv_shader_args* args, |
| unsigned* num_preserved_sgprs); |
| |
| void select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key, |
| ac_shader_config* config, |
| const struct aco_compiler_options* options, |
| const struct aco_shader_info* info, |
| const struct radv_shader_args* args); |
| |
| void lower_phis(Program* program); |
| void calc_min_waves(Program* program); |
| void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); |
| live live_var_analysis(Program* program); |
| std::vector<uint16_t> dead_code_analysis(Program* program); |
| void dominator_tree(Program* program); |
| void insert_exec_mask(Program* program); |
| void value_numbering(Program* program); |
| void optimize(Program* program); |
| void optimize_postRA(Program* program); |
| void setup_reduce_temp(Program* program); |
| void lower_to_cssa(Program* program, live& live_vars); |
| void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, |
| ra_test_policy = {}); |
| void ssa_elimination(Program* program); |
| void lower_to_hw_instr(Program* program); |
| void schedule_program(Program* program, live& live_vars); |
| void spill(Program* program, live& live_vars); |
| void insert_wait_states(Program* program); |
| bool dealloc_vgprs(Program* program); |
| void insert_NOPs(Program* program); |
| void form_hard_clauses(Program* program); |
| unsigned emit_program(Program* program, std::vector<uint32_t>& code); |
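
/* Rough sketch of how these passes can be sequenced (illustrative ordering,
 * inferred from the pass names rather than a normative pipeline):
 *
 *   select_program(...);                              // NIR -> ACO IR
 *   value_numbering(program);
 *   optimize(program);
 *   live live_vars = live_var_analysis(program);
 *   spill(program, live_vars);
 *   register_allocation(program, live_vars.live_out);
 *   ssa_elimination(program);
 *   lower_to_hw_instr(program);
 *   insert_wait_states(program);
 *   insert_NOPs(program);
 *   std::vector<uint32_t> code;
 *   unsigned exec_size = emit_program(program, code);
 */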
| /** |
| * Returns true if print_asm can disassemble the given program for the current build/runtime |
| * configuration |
| */ |
| bool check_print_asm_support(Program* program); |
| bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output); |
| bool validate_ir(Program* program); |
| bool validate_ra(Program* program); |
| #ifndef NDEBUG |
| void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL); |
| #else |
| #define perfwarn(program, cond, msg, ...) \ |
| do { \ |
| } while (0) |
| #endif |
| |
| void collect_presched_stats(Program* program); |
| void collect_preasm_stats(Program* program); |
| void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code); |
| |
| enum print_flags { |
| print_no_ssa = 0x1, |
| print_perf_info = 0x2, |
| print_kill = 0x4, |
| print_live_vars = 0x8, |
| }; |
| |
| void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0); |
| void aco_print_instr(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* output, |
| unsigned flags = 0); |
| void aco_print_program(const Program* program, FILE* output, unsigned flags = 0); |
| void aco_print_program(const Program* program, FILE* output, const live& live_vars, |
| unsigned flags = 0); |
| |
| void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...); |
| void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...); |
| |
| #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__) |
| #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__) |
| |
| /* utilities for dealing with register demand */ |
| RegisterDemand get_live_changes(aco_ptr<Instruction>& instr); |
| RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr); |
| RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, |
| aco_ptr<Instruction>& instr_before); |
| |
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
| uint16_t get_extra_sgprs(Program* program); |
| |
| /* adjust num_waves for workgroup size and LDS limits */ |
| uint16_t max_suitable_waves(Program* program, uint16_t waves); |
| |
/* get the number of sgprs/vgprs that must be allocated to address a given number of sgprs/vgprs */
| uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs); |
| uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs); |
| |
| /* return number of addressable sgprs/vgprs for max_waves */ |
| uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves); |
| uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves); |
| |
| typedef struct { |
| const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)]; |
| const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)]; |
| const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)]; |
| const int16_t opcode_gfx11[static_cast<int>(aco_opcode::num_opcodes)]; |
| const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers; |
| const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers; |
| const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic; |
| const char* name[static_cast<int>(aco_opcode::num_opcodes)]; |
| const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; |
| /* sizes used for input/output modifiers and constants */ |
| const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)]; |
| const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)]; |
| } Info; |
| |
| extern const Info instr_info; |
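
/* Example (illustrative): per-opcode metadata lookup, indexed by opcode:
 *
 *   const char* name = instr_info.name[(int)aco_opcode::v_add_f32];
 *   Format fmt = instr_info.format[(int)aco_opcode::v_add_f32];
 */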
| |
| } // namespace aco |
| |
| #endif /* ACO_IR_H */ |