// Copyright © 2022 Collabora, Ltd.
// SPDX-License-Identifier: MIT
extern crate bitview;
extern crate nak_ir_proc;
use bitview::{BitMutView, BitMutViewable, BitView, BitViewable, SetField};
use nak_bindings::*;
pub use crate::builder::{Builder, InstrBuilder, SSABuilder, SSAInstrBuilder};
use crate::legalize::LegalizeBuilder;
use crate::sph::{OutputTopology, PixelImap};
pub use crate::ssa_value::*;
use compiler::as_slice::*;
use compiler::cfg::CFG;
use compiler::smallvec::SmallVec;
use nak_ir_proc::*;
use std::cmp::{max, min};
use std::fmt;
use std::fmt::Write;
use std::iter::Zip;
use std::ops::{BitAnd, BitOr, Deref, DerefMut, Index, IndexMut, Not, Range};
use std::slice;
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub struct Label {
idx: u32,
}
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "L{}", self.idx)
}
}
pub struct LabelAllocator {
count: u32,
}
impl LabelAllocator {
pub fn new() -> LabelAllocator {
LabelAllocator { count: 0 }
}
pub fn alloc(&mut self) -> Label {
let idx = self.count;
self.count += 1;
Label { idx }
}
}
/// Represents a register file
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum RegFile {
/// The general-purpose register file
///
/// General-purpose registers are 32 bits per SIMT channel.
GPR = 0,
/// The general-purpose uniform register file
///
/// General-purpose uniform registers are 32 bits each and uniform across a
/// wave.
UGPR = 1,
/// The predicate register file
///
/// Predicate registers are 1 bit per SIMT channel.
Pred = 2,
/// The uniform predicate register file
///
/// Uniform predicate registers are 1 bit and uniform across a wave.
UPred = 3,
/// The carry flag register file
///
/// Only one carry flag register exists in hardware, but representing it as
/// a reg file simplifies dependency tracking.
///
/// This is used only on SM50.
Carry = 4,
/// The barrier register file
///
/// This is a lane mask used for wave re-convergence instructions.
Bar = 5,
/// The memory register file
///
/// This is a virtual register file for things which will get spilled to
/// local memory. Each memory location is 32 bits per SIMT channel.
Mem = 6,
}
const NUM_REG_FILES: usize = 7;
impl RegFile {
/// Returns true if the register file is uniform across a wave.
pub fn is_uniform(&self) -> bool {
match self {
RegFile::GPR
| RegFile::Pred
| RegFile::Carry
| RegFile::Bar
| RegFile::Mem => false,
RegFile::UGPR | RegFile::UPred => true,
}
}
/// Returns the uniform form of this register file, if any. For `GPR` and
/// `UGPR`, this returns `UGPR`; for `Pred` and `UPred`, this returns
/// `UPred`.
pub fn to_uniform(self) -> Option<RegFile> {
match self {
RegFile::GPR | RegFile::UGPR => Some(RegFile::UGPR),
RegFile::Pred | RegFile::UPred => Some(RegFile::UPred),
RegFile::Carry | RegFile::Bar | RegFile::Mem => None,
}
}
/// Returns the warp-wide version of this register file.
pub fn to_warp(self) -> RegFile {
match self {
RegFile::GPR | RegFile::UGPR => RegFile::GPR,
RegFile::Pred | RegFile::UPred => RegFile::Pred,
RegFile::Carry | RegFile::Bar | RegFile::Mem => self,
}
}
/// Returns true if the register file is GPR or UGPR.
pub fn is_gpr(&self) -> bool {
match self {
RegFile::GPR | RegFile::UGPR => true,
RegFile::Pred
| RegFile::UPred
| RegFile::Carry
| RegFile::Bar
| RegFile::Mem => false,
}
}
/// Returns true if the register file is a predicate register file.
pub fn is_predicate(&self) -> bool {
match self {
RegFile::GPR
| RegFile::UGPR
| RegFile::Carry
| RegFile::Bar
| RegFile::Mem => false,
RegFile::Pred | RegFile::UPred => true,
}
}
pub fn fmt_prefix(&self) -> &'static str {
match self {
RegFile::GPR => "r",
RegFile::UGPR => "ur",
RegFile::Pred => "p",
RegFile::UPred => "up",
RegFile::Carry => "c",
RegFile::Bar => "b",
RegFile::Mem => "m",
}
}
}
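// Illustrative sanity check of the uniform/warp mappings documented above; a
// minimal sketch using only the RegFile API defined in this file.
#[cfg(test)]
mod reg_file_mapping_example {
    use super::*;

    #[test]
    fn uniform_and_warp_round_trip() {
        // GPR and UGPR are the warp-wide/uniform forms of each other.
        assert_eq!(RegFile::GPR.to_uniform(), Some(RegFile::UGPR));
        assert_eq!(RegFile::UGPR.to_warp(), RegFile::GPR);
        // Carry has no uniform form.
        assert_eq!(RegFile::Carry.to_uniform(), None);
        // to_warp() is the identity on files with no uniform counterpart.
        assert_eq!(RegFile::Mem.to_warp(), RegFile::Mem);
    }
}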
impl fmt::Display for RegFile {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
RegFile::GPR => write!(f, "GPR"),
RegFile::UGPR => write!(f, "UGPR"),
RegFile::Pred => write!(f, "Pred"),
RegFile::UPred => write!(f, "UPred"),
RegFile::Carry => write!(f, "Carry"),
RegFile::Bar => write!(f, "Bar"),
RegFile::Mem => write!(f, "Mem"),
}
}
}
impl From<RegFile> for u8 {
fn from(value: RegFile) -> u8 {
value as u8
}
}
impl TryFrom<u32> for RegFile {
type Error = &'static str;
fn try_from(value: u32) -> Result<Self, Self::Error> {
match value {
0 => Ok(RegFile::GPR),
1 => Ok(RegFile::UGPR),
2 => Ok(RegFile::Pred),
3 => Ok(RegFile::UPred),
4 => Ok(RegFile::Carry),
5 => Ok(RegFile::Bar),
6 => Ok(RegFile::Mem),
_ => Err("Invalid register file number"),
}
}
}
impl TryFrom<u16> for RegFile {
type Error = &'static str;
fn try_from(value: u16) -> Result<Self, Self::Error> {
RegFile::try_from(u32::from(value))
}
}
impl TryFrom<u8> for RegFile {
type Error = &'static str;
fn try_from(value: u8) -> Result<Self, Self::Error> {
RegFile::try_from(u32::from(value))
}
}
/// A trait for things which have an associated register file
pub trait HasRegFile {
fn file(&self) -> RegFile;
fn is_uniform(&self) -> bool {
self.file().is_uniform()
}
fn is_gpr(&self) -> bool {
self.file().is_gpr()
}
fn is_predicate(&self) -> bool {
self.file().is_predicate()
}
}
impl HasRegFile for &[SSAValue] {
fn file(&self) -> RegFile {
let comps = self.len();
let file = self[0].file();
for i in 1..comps {
if self[i].file() != file {
panic!("Illegal mix of RegFiles")
}
}
file
}
}
#[derive(Clone)]
pub struct RegFileSet {
bits: u8,
}
impl RegFileSet {
pub fn new() -> RegFileSet {
RegFileSet { bits: 0 }
}
pub fn len(&self) -> usize {
self.bits.count_ones() as usize
}
pub fn contains(&self, file: RegFile) -> bool {
self.bits & (1 << (file as u8)) != 0
}
pub fn insert(&mut self, file: RegFile) -> bool {
let has_file = self.contains(file);
self.bits |= 1 << (file as u8);
!has_file
}
pub fn is_empty(&self) -> bool {
self.bits == 0
}
#[allow(dead_code)]
pub fn iter(&self) -> RegFileSet {
self.clone()
}
pub fn remove(&mut self, file: RegFile) -> bool {
let has_file = self.contains(file);
self.bits &= !(1 << (file as u8));
has_file
}
}
impl FromIterator<RegFile> for RegFileSet {
fn from_iter<T: IntoIterator<Item = RegFile>>(iter: T) -> Self {
let mut set = RegFileSet::new();
for file in iter {
set.insert(file);
}
set
}
}
impl Iterator for RegFileSet {
type Item = RegFile;
fn next(&mut self) -> Option<RegFile> {
if self.is_empty() {
None
} else {
let file = self.bits.trailing_zeros().try_into().unwrap();
self.remove(file);
Some(file)
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
let len = self.len();
(len, Some(len))
}
}
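// Illustrative sketch of RegFileSet as a small bitset: insertion order does
// not matter and iteration yields files in ascending enum order. Uses only
// the API above.
#[cfg(test)]
mod reg_file_set_example {
    use super::*;

    #[test]
    fn insert_and_iterate() {
        let mut set = RegFileSet::new();
        assert!(set.insert(RegFile::UPred));
        assert!(set.insert(RegFile::GPR));
        // Re-inserting an existing file returns false.
        assert!(!set.insert(RegFile::GPR));
        assert_eq!(set.len(), 2);
        // iter() consumes a clone, yielding the lowest bits first.
        let files: Vec<RegFile> = set.iter().collect();
        assert_eq!(files, [RegFile::GPR, RegFile::UPred]);
    }
}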
/// A container mapping register files to items.
///
/// This is used by several passes which need to replicate a data structure
/// per-register-file.
#[derive(Clone, Copy)]
pub struct PerRegFile<T> {
per_file: [T; NUM_REG_FILES],
}
impl<T> PerRegFile<T> {
/// Creates a new per-register-file container.
///
/// Because this container assumes it always has an item for each register
/// file, it takes a callback which maps register files to initial values
/// to avoid adding a bunch of `Option<T>` or requiring `T` to implement
/// `Default`. If `T` does implement `Default`, then so does
/// `PerRegFile<T>`.
pub fn new_with<F: Fn(RegFile) -> T>(f: F) -> Self {
PerRegFile {
per_file: [
f(RegFile::GPR),
f(RegFile::UGPR),
f(RegFile::Pred),
f(RegFile::UPred),
f(RegFile::Carry),
f(RegFile::Bar),
f(RegFile::Mem),
],
}
}
/// Iterates over the values in this container.
pub fn values(&self) -> slice::Iter<'_, T> {
self.per_file.iter()
}
/// Iterates over the mutable values in this container.
pub fn values_mut(&mut self) -> slice::IterMut<'_, T> {
self.per_file.iter_mut()
}
}
impl<T: Default> Default for PerRegFile<T> {
fn default() -> Self {
PerRegFile {
per_file: Default::default(),
}
}
}
impl<T> Index<RegFile> for PerRegFile<T> {
type Output = T;
fn index(&self, idx: RegFile) -> &T {
&self.per_file[idx as u8 as usize]
}
}
impl<T> IndexMut<RegFile> for PerRegFile<T> {
fn index_mut(&mut self, idx: RegFile) -> &mut T {
&mut self.per_file[idx as u8 as usize]
}
}
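// Illustrative sketch of PerRegFile construction and indexing; a minimal
// example assuming nothing beyond the API above.
#[cfg(test)]
mod per_reg_file_example {
    use super::*;

    #[test]
    fn new_with_and_index() {
        // Initialize each slot from its register file's discriminant.
        let counts = PerRegFile::new_with(|f| u8::from(f));
        assert_eq!(counts[RegFile::GPR], 0);
        assert_eq!(counts[RegFile::Mem], 6);
        // The container holds exactly one entry per register file.
        assert_eq!(counts.values().count(), NUM_REG_FILES);
    }
}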
/// A reference to a contiguous range of registers in a particular register
/// file.
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub struct RegRef {
packed: u32,
}
impl RegRef {
pub const MAX_IDX: u32 = (1 << 26) - 1;
/// Creates a new register reference.
///
/// # Panics
///
/// This method panics if `base_idx > RegRef::MAX_IDX` or if `comps > 8`.
pub fn new(file: RegFile, base_idx: u32, comps: u8) -> RegRef {
assert!(base_idx <= Self::MAX_IDX);
let mut packed = base_idx;
assert!(comps > 0 && comps <= 8);
packed |= u32::from(comps - 1) << 26;
assert!(u8::from(file) < 8);
packed |= u32::from(u8::from(file)) << 29;
RegRef { packed }
}
/// Returns the index of the first register referenced.
pub fn base_idx(&self) -> u32 {
self.packed & 0x03ffffff
}
/// Returns the range of register indices referenced.
pub fn idx_range(&self) -> Range<u32> {
let start = self.base_idx();
let end = start + u32::from(self.comps());
start..end
}
/// Returns the number of registers referenced.
pub fn comps(&self) -> u8 {
(((self.packed >> 26) & 0x7) + 1).try_into().unwrap()
}
/// Returns a reference to the single register at `base_idx() + c`.
pub fn comp(&self, c: u8) -> RegRef {
assert!(c < self.comps());
RegRef::new(self.file(), self.base_idx() + u32::from(c), 1)
}
}
impl HasRegFile for RegRef {
fn file(&self) -> RegFile {
((self.packed >> 29) & 0x7).try_into().unwrap()
}
}
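// Illustrative round-trip of the RegRef bit packing used above (26 bits of
// index, 3 bits of component count, 3 bits of file); a sketch built only on
// the accessors defined in this file.
#[cfg(test)]
mod reg_ref_packing_example {
    use super::*;

    #[test]
    fn pack_and_unpack() {
        let r = RegRef::new(RegFile::UGPR, 4, 2);
        assert_eq!(r.file(), RegFile::UGPR);
        assert_eq!(r.base_idx(), 4);
        assert_eq!(r.comps(), 2);
        assert_eq!(r.idx_range(), 4..6);
        // comp() picks out a single register within the range.
        assert!(r.comp(1) == RegRef::new(RegFile::UGPR, 5, 1));
        // Display uses the file's prefix and an exclusive end index.
        assert_eq!(format!("{}", r), "ur4..6");
    }
}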
impl fmt::Display for RegRef {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}{}", self.file().fmt_prefix(), self.base_idx())?;
if self.comps() > 1 {
write!(f, "..{}", self.idx_range().end)?;
}
Ok(())
}
}
#[derive(Clone)]
pub enum Dst {
None,
SSA(SSARef),
Reg(RegRef),
}
impl Dst {
pub fn is_none(&self) -> bool {
matches!(self, Dst::None)
}
pub fn as_reg(&self) -> Option<&RegRef> {
match self {
Dst::Reg(r) => Some(r),
_ => None,
}
}
pub fn as_ssa(&self) -> Option<&SSARef> {
match self {
Dst::SSA(r) => Some(r),
_ => None,
}
}
#[allow(dead_code)]
pub fn to_ssa(self) -> SSARef {
match self {
Dst::SSA(r) => r,
_ => panic!("Expected ssa"),
}
}
pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> {
match self {
Dst::None | Dst::Reg(_) => &[],
Dst::SSA(ssa) => ssa.deref(),
}
.iter()
}
pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> {
match self {
Dst::None | Dst::Reg(_) => &mut [],
Dst::SSA(ssa) => ssa.deref_mut(),
}
.iter_mut()
}
}
impl From<RegRef> for Dst {
fn from(reg: RegRef) -> Dst {
Dst::Reg(reg)
}
}
impl<T: Into<SSARef>> From<T> for Dst {
fn from(ssa: T) -> Dst {
Dst::SSA(ssa.into())
}
}
impl From<Option<SSAValue>> for Dst {
fn from(ssa: Option<SSAValue>) -> Dst {
match ssa {
Some(ssa) => Dst::SSA(ssa.into()),
None => Dst::None,
}
}
}
impl fmt::Display for Dst {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Dst::None => write!(f, "null")?,
Dst::SSA(v) => v.fmt(f)?,
Dst::Reg(r) => r.fmt(f)?,
}
Ok(())
}
}
#[derive(Clone, Eq, Hash, PartialEq)]
pub enum CBuf {
Binding(u8),
#[allow(dead_code)]
BindlessSSA([SSAValue; 2]),
#[allow(dead_code)]
BindlessUGPR(RegRef),
}
impl fmt::Display for CBuf {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
CBuf::Binding(idx) => write!(f, "c[{:#x}]", idx),
CBuf::BindlessSSA(v) => write!(f, "cx[{{{}, {}}}]", v[0], v[1]),
CBuf::BindlessUGPR(r) => write!(f, "cx[{}]", r),
}
}
}
#[derive(Clone, Eq, Hash, PartialEq)]
pub struct CBufRef {
pub buf: CBuf,
pub offset: u16,
}
impl CBufRef {
pub fn offset(self, offset: u16) -> CBufRef {
CBufRef {
buf: self.buf,
offset: self.offset + offset,
}
}
}
impl fmt::Display for CBufRef {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}[{:#x}]", self.buf, self.offset)
}
}
#[derive(Clone, Eq, Hash, PartialEq)]
pub enum SrcRef {
Zero,
True,
False,
Imm32(u32),
CBuf(CBufRef),
SSA(SSARef),
Reg(RegRef),
}
impl SrcRef {
#[allow(dead_code)]
pub fn is_alu(&self) -> bool {
match self {
SrcRef::Zero | SrcRef::Imm32(_) | SrcRef::CBuf(_) => true,
SrcRef::SSA(ssa) => ssa.is_gpr(),
SrcRef::Reg(reg) => reg.is_gpr(),
SrcRef::True | SrcRef::False => false,
}
}
pub fn is_bindless_cbuf(&self) -> bool {
match self {
SrcRef::CBuf(cbuf) => {
matches!(cbuf.buf, CBuf::BindlessSSA(_) | CBuf::BindlessUGPR(_))
}
_ => false,
}
}
pub fn is_predicate(&self) -> bool {
match self {
SrcRef::Zero | SrcRef::Imm32(_) | SrcRef::CBuf(_) => false,
SrcRef::True | SrcRef::False => true,
SrcRef::SSA(ssa) => ssa.is_predicate(),
SrcRef::Reg(reg) => reg.is_predicate(),
}
}
pub fn is_carry(&self) -> bool {
match self {
SrcRef::SSA(ssa) => ssa.file() == RegFile::Carry,
SrcRef::Reg(reg) => reg.file() == RegFile::Carry,
_ => false,
}
}
#[allow(dead_code)]
pub fn is_barrier(&self) -> bool {
match self {
SrcRef::SSA(ssa) => ssa.file() == RegFile::Bar,
SrcRef::Reg(reg) => reg.file() == RegFile::Bar,
_ => false,
}
}
pub fn as_reg(&self) -> Option<&RegRef> {
match self {
SrcRef::Reg(r) => Some(r),
_ => None,
}
}
pub fn as_ssa(&self) -> Option<&SSARef> {
match self {
SrcRef::SSA(r) => Some(r),
_ => None,
}
}
pub fn to_ssa(self) -> SSARef {
match self {
SrcRef::SSA(r) => r,
_ => panic!("Expected SSA"),
}
}
pub fn as_u32(&self) -> Option<u32> {
match self {
SrcRef::Zero => Some(0),
SrcRef::Imm32(u) => Some(*u),
SrcRef::CBuf(_) | SrcRef::SSA(_) | SrcRef::Reg(_) => None,
_ => panic!("Invalid integer source"),
}
}
pub fn get_reg(&self) -> Option<&RegRef> {
match self {
SrcRef::Zero
| SrcRef::True
| SrcRef::False
| SrcRef::Imm32(_)
| SrcRef::SSA(_) => None,
SrcRef::CBuf(cb) => match &cb.buf {
CBuf::Binding(_) | CBuf::BindlessSSA(_) => None,
CBuf::BindlessUGPR(reg) => Some(reg),
},
SrcRef::Reg(reg) => Some(reg),
}
}
pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> {
match self {
SrcRef::Zero
| SrcRef::True
| SrcRef::False
| SrcRef::Imm32(_)
| SrcRef::Reg(_) => &[],
SrcRef::CBuf(cb) => match &cb.buf {
CBuf::Binding(_) | CBuf::BindlessUGPR(_) => &[],
CBuf::BindlessSSA(ssa) => &ssa[..],
},
SrcRef::SSA(ssa) => ssa.deref(),
}
.iter()
}
pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> {
match self {
SrcRef::Zero
| SrcRef::True
| SrcRef::False
| SrcRef::Imm32(_)
| SrcRef::Reg(_) => &mut [],
SrcRef::CBuf(cb) => match &mut cb.buf {
CBuf::Binding(_) | CBuf::BindlessUGPR(_) => &mut [],
CBuf::BindlessSSA(ssa) => &mut ssa[..],
},
SrcRef::SSA(ssa) => ssa.deref_mut(),
}
.iter_mut()
}
}
impl From<bool> for SrcRef {
fn from(b: bool) -> SrcRef {
if b {
SrcRef::True
} else {
SrcRef::False
}
}
}
impl From<u32> for SrcRef {
fn from(u: u32) -> SrcRef {
if u == 0 {
SrcRef::Zero
} else {
SrcRef::Imm32(u)
}
}
}
impl From<f32> for SrcRef {
fn from(f: f32) -> SrcRef {
f.to_bits().into()
}
}
impl From<PrmtSel> for SrcRef {
fn from(sel: PrmtSel) -> SrcRef {
u32::from(sel.0).into()
}
}
impl From<CBufRef> for SrcRef {
fn from(cb: CBufRef) -> SrcRef {
SrcRef::CBuf(cb)
}
}
impl From<RegRef> for SrcRef {
fn from(reg: RegRef) -> SrcRef {
SrcRef::Reg(reg)
}
}
impl<T: Into<SSARef>> From<T> for SrcRef {
fn from(ssa: T) -> SrcRef {
SrcRef::SSA(ssa.into())
}
}
impl From<PredRef> for SrcRef {
fn from(value: PredRef) -> Self {
match value {
PredRef::None => SrcRef::True,
PredRef::Reg(reg) => SrcRef::Reg(reg),
PredRef::SSA(ssa) => SrcRef::SSA(ssa.into()),
}
}
}
impl fmt::Display for SrcRef {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
SrcRef::Zero => write!(f, "rZ"),
SrcRef::True => write!(f, "pT"),
SrcRef::False => write!(f, "pF"),
SrcRef::Imm32(u) => write!(f, "{:#x}", u),
SrcRef::CBuf(c) => c.fmt(f),
SrcRef::SSA(v) => v.fmt(f),
SrcRef::Reg(r) => r.fmt(f),
}
}
}
#[derive(Clone, Copy, PartialEq)]
pub enum SrcMod {
None,
FAbs,
FNeg,
FNegAbs,
INeg,
BNot,
}
impl SrcMod {
pub fn is_none(&self) -> bool {
matches!(self, SrcMod::None)
}
pub fn has_fabs(&self) -> bool {
match self {
SrcMod::None | SrcMod::FNeg => false,
SrcMod::FAbs | SrcMod::FNegAbs => true,
_ => panic!("Not a float modifier"),
}
}
pub fn has_fneg(&self) -> bool {
match self {
SrcMod::None | SrcMod::FAbs => false,
SrcMod::FNeg | SrcMod::FNegAbs => true,
_ => panic!("Not a float modifier"),
}
}
pub fn is_ineg(&self) -> bool {
match self {
SrcMod::None => false,
SrcMod::INeg => true,
_ => panic!("Not an integer modifier"),
}
}
pub fn is_bnot(&self) -> bool {
match self {
SrcMod::None => false,
SrcMod::BNot => true,
_ => panic!("Not a bitwise modifier"),
}
}
pub fn fabs(self) -> SrcMod {
match self {
SrcMod::None | SrcMod::FAbs | SrcMod::FNeg | SrcMod::FNegAbs => {
SrcMod::FAbs
}
_ => panic!("Not a float source modifier"),
}
}
pub fn fneg(self) -> SrcMod {
match self {
SrcMod::None => SrcMod::FNeg,
SrcMod::FAbs => SrcMod::FNegAbs,
SrcMod::FNeg => SrcMod::None,
SrcMod::FNegAbs => SrcMod::FAbs,
_ => panic!("Not a float source modifier"),
}
}
pub fn ineg(self) -> SrcMod {
match self {
SrcMod::None => SrcMod::INeg,
SrcMod::INeg => SrcMod::None,
_ => panic!("Not an integer source modifier"),
}
}
pub fn bnot(self) -> SrcMod {
match self {
SrcMod::None => SrcMod::BNot,
SrcMod::BNot => SrcMod::None,
_ => panic!("Not a boolean source modifier"),
}
}
pub fn modify(self, other: SrcMod) -> SrcMod {
match other {
SrcMod::None => self,
SrcMod::FAbs => self.fabs(),
SrcMod::FNeg => self.fneg(),
SrcMod::FNegAbs => self.fabs().fneg(),
SrcMod::INeg => self.ineg(),
SrcMod::BNot => self.bnot(),
}
}
}
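// Illustrative sketch of how source modifiers compose: fneg and bnot are
// involutions while fabs absorbs a preceding negation, as described above.
#[cfg(test)]
mod src_mod_example {
    use super::*;

    #[test]
    fn modifiers_compose() {
        // Negation cancels itself...
        assert!(SrcMod::None.fneg().fneg() == SrcMod::None);
        // ...but abs absorbs a preceding negation.
        assert!(SrcMod::None.fneg().fabs() == SrcMod::FAbs);
        // Applying fneg to fabs gives -|x|.
        assert!(SrcMod::None.fabs().fneg() == SrcMod::FNegAbs);
        assert!(SrcMod::None.bnot().bnot() == SrcMod::None);
    }
}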
#[derive(Clone, Copy, PartialEq)]
#[allow(dead_code)]
pub enum SrcSwizzle {
None,
Xx,
Yy,
}
impl SrcSwizzle {
pub fn is_none(&self) -> bool {
matches!(self, SrcSwizzle::None)
}
}
impl fmt::Display for SrcSwizzle {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
SrcSwizzle::None => Ok(()),
SrcSwizzle::Xx => write!(f, ".xx"),
SrcSwizzle::Yy => write!(f, ".yy"),
}
}
}
#[derive(Clone, PartialEq)]
pub struct Src {
pub src_ref: SrcRef,
pub src_mod: SrcMod,
pub src_swizzle: SrcSwizzle,
}
impl Src {
pub const ZERO: Src = Src {
src_ref: SrcRef::Zero,
src_mod: SrcMod::None,
src_swizzle: SrcSwizzle::None,
};
pub fn new_imm_u32(u: u32) -> Src {
u.into()
}
pub fn new_imm_bool(b: bool) -> Src {
b.into()
}
pub fn is_unmodified(&self) -> bool {
self.src_mod.is_none() && self.src_swizzle.is_none()
}
pub fn fabs(self) -> Src {
Src {
src_ref: self.src_ref,
src_mod: self.src_mod.fabs(),
src_swizzle: self.src_swizzle,
}
}
pub fn fneg(self) -> Src {
Src {
src_ref: self.src_ref,
src_mod: self.src_mod.fneg(),
src_swizzle: self.src_swizzle,
}
}
pub fn ineg(self) -> Src {
Src {
src_ref: self.src_ref,
src_mod: self.src_mod.ineg(),
src_swizzle: self.src_swizzle,
}
}
pub fn bnot(self) -> Src {
Src {
src_ref: self.src_ref,
src_mod: self.src_mod.bnot(),
src_swizzle: self.src_swizzle,
}
}
pub fn modify(mut self, src_mod: SrcMod) -> Src {
self.src_mod = self.src_mod.modify(src_mod);
self
}
pub fn as_u32(&self, src_type: SrcType) -> Option<u32> {
let u = match &self.src_ref {
SrcRef::Zero => 0,
SrcRef::Imm32(u) => *u,
_ => return None,
};
if self.is_unmodified() {
return Some(u);
}
assert!(src_type == SrcType::F16v2 || self.src_swizzle.is_none());
// INeg affects more than just the 32 bits of input data so it can't be
// trivially folded. In fact, -imm may not be representable as a 32-bit
// immediate at all.
if src_type == SrcType::I32 {
return None;
}
Some(match src_type {
SrcType::F16 => {
let low = u & 0xFFFF;
match self.src_mod {
SrcMod::None => low,
SrcMod::FAbs => low & !(1_u32 << 15),
SrcMod::FNeg => low ^ (1_u32 << 15),
SrcMod::FNegAbs => low | (1_u32 << 15),
_ => panic!("Not a float source modifier"),
}
}
SrcType::F16v2 => {
let u = match self.src_swizzle {
SrcSwizzle::None => u,
SrcSwizzle::Xx => (u << 16) | (u & 0xffff),
SrcSwizzle::Yy => (u & 0xffff0000) | (u >> 16),
};
match self.src_mod {
SrcMod::None => u,
SrcMod::FAbs => u & 0x7FFF7FFF,
SrcMod::FNeg => u ^ 0x80008000,
SrcMod::FNegAbs => u | 0x80008000,
_ => panic!("Not a float source modifier"),
}
}
SrcType::F32 | SrcType::F64 => match self.src_mod {
SrcMod::None => u,
SrcMod::FAbs => u & !(1_u32 << 31),
SrcMod::FNeg => u ^ (1_u32 << 31),
SrcMod::FNegAbs => u | (1_u32 << 31),
_ => panic!("Not a float source modifier"),
},
SrcType::I32 => match self.src_mod {
SrcMod::None => u,
SrcMod::INeg => -(u as i32) as u32,
_ => panic!("Not an integer source modifier"),
},
SrcType::B32 => match self.src_mod {
SrcMod::None => u,
SrcMod::BNot => !u,
_ => panic!("Not a bitwise source modifier"),
},
_ => {
assert!(self.is_unmodified());
u
}
})
}
pub fn as_ssa(&self) -> Option<&SSARef> {
if self.is_unmodified() {
self.src_ref.as_ssa()
} else {
None
}
}
pub fn to_ssa(self) -> SSARef {
if self.is_unmodified() {
self.src_ref.to_ssa()
} else {
panic!("Did not expect src_mod");
}
}
pub fn as_bool(&self) -> Option<bool> {
match &self.src_ref {
SrcRef::True => Some(!self.src_mod.is_bnot()),
SrcRef::False => Some(self.src_mod.is_bnot()),
SrcRef::SSA(vec) => {
assert!(vec.is_predicate() && vec.comps() == 1);
None
}
SrcRef::Reg(reg) => {
assert!(reg.is_predicate() && reg.comps() == 1);
None
}
_ => panic!("Not a boolean source"),
}
}
pub fn as_imm_not_i20(&self) -> Option<u32> {
match self.src_ref {
SrcRef::Imm32(i) => {
assert!(self.is_unmodified());
let top = i & 0xfff80000;
if top == 0 || top == 0xfff80000 {
None
} else {
Some(i)
}
}
_ => None,
}
}
pub fn as_imm_not_f20(&self) -> Option<u32> {
match self.src_ref {
SrcRef::Imm32(i) => {
assert!(self.is_unmodified());
if (i & 0xfff) == 0 {
None
} else {
Some(i)
}
}
_ => None,
}
}
pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> {
self.src_ref.iter_ssa()
}
pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> {
self.src_ref.iter_ssa_mut()
}
pub fn is_uniform(&self) -> bool {
match &self.src_ref {
SrcRef::Zero
| SrcRef::True
| SrcRef::False
| SrcRef::Imm32(_)
| SrcRef::CBuf(_) => true,
SrcRef::SSA(ssa) => ssa.is_uniform(),
SrcRef::Reg(reg) => reg.is_uniform(),
}
}
pub fn is_bindless_cbuf(&self) -> bool {
self.src_ref.is_bindless_cbuf()
}
pub fn is_upred_reg(&self) -> bool {
match &self.src_ref {
SrcRef::SSA(ssa) => ssa.file() == RegFile::UPred,
SrcRef::Reg(reg) => reg.file() == RegFile::UPred,
_ => false,
}
}
pub fn is_predicate(&self) -> bool {
self.src_ref.is_predicate()
}
pub fn is_zero(&self) -> bool {
match self.src_ref {
SrcRef::Zero | SrcRef::Imm32(0) => match self.src_mod {
SrcMod::None | SrcMod::FAbs => true,
SrcMod::FNeg | SrcMod::FNegAbs | SrcMod::BNot => false,
// INeg affects more than just the 32 bits of input data so -0
// may not be equivalent to 0.
SrcMod::INeg => false,
},
_ => false,
}
}
pub fn is_nonzero(&self) -> bool {
assert!(self.is_unmodified());
matches!(self.src_ref, SrcRef::Imm32(x) if x != 0)
}
pub fn is_true(&self) -> bool {
self.as_bool() == Some(true)
}
pub fn is_fneg_zero(&self, src_type: SrcType) -> bool {
match self.as_u32(src_type) {
Some(0x00008000) => src_type == SrcType::F16,
Some(0x80000000) => {
src_type == SrcType::F32 || src_type == SrcType::F64
}
Some(0x80008000) => src_type == SrcType::F16v2,
_ => false,
}
}
#[allow(dead_code)]
pub fn supports_type(&self, src_type: &SrcType) -> bool {
match src_type {
SrcType::SSA => {
if !self.is_unmodified() {
return false;
}
matches!(self.src_ref, SrcRef::SSA(_) | SrcRef::Reg(_))
}
SrcType::GPR => {
if !self.is_unmodified() {
return false;
}
matches!(
self.src_ref,
SrcRef::Zero | SrcRef::SSA(_) | SrcRef::Reg(_)
)
}
SrcType::ALU => self.is_unmodified() && self.src_ref.is_alu(),
SrcType::F16 | SrcType::F32 | SrcType::F64 | SrcType::F16v2 => {
match self.src_mod {
SrcMod::None
| SrcMod::FAbs
| SrcMod::FNeg
| SrcMod::FNegAbs => (),
_ => return false,
}
self.src_ref.is_alu()
}
SrcType::I32 => {
match self.src_mod {
SrcMod::None | SrcMod::INeg => (),
_ => return false,
}
self.src_ref.is_alu()
}
SrcType::B32 => {
match self.src_mod {
SrcMod::None | SrcMod::BNot => (),
_ => return false,
}
self.src_ref.is_alu()
}
SrcType::Pred => {
match self.src_mod {
SrcMod::None | SrcMod::BNot => (),
_ => return false,
}
self.src_ref.is_predicate()
}
SrcType::Carry => self.is_unmodified() && self.src_ref.is_carry(),
SrcType::Bar => self.is_unmodified() && self.src_ref.is_barrier(),
}
}
}
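// Illustrative sketch of immediate folding in Src::as_u32(): float negation
// flips sign bits in place, swizzles replicate f16 halves, and INeg is never
// folded (see the comment in as_u32 above). Uses only the API in this file.
#[cfg(test)]
mod src_as_u32_example {
    use super::*;

    #[test]
    fn fold_imm_modifiers() {
        // -1.0f32 folds to the IEEE bit pattern with the sign bit flipped.
        let neg_one = Src::from(1.0_f32).fneg();
        assert_eq!(neg_one.as_u32(SrcType::F32), Some(0xbf800000));
        // An .xx swizzle replicates the low f16 into both halves.
        let mut swz = Src::from(0x1234abcd_u32);
        swz.src_swizzle = SrcSwizzle::Xx;
        assert_eq!(swz.as_u32(SrcType::F16v2), Some(0xabcdabcd));
        // INeg is not folded; -imm may not even be representable.
        assert_eq!(Src::from(5_u32).ineg().as_u32(SrcType::I32), None);
    }
}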
impl<T: Into<SrcRef>> From<T> for Src {
fn from(value: T) -> Src {
Src {
src_ref: value.into(),
src_mod: SrcMod::None,
src_swizzle: SrcSwizzle::None,
}
}
}
impl From<Pred> for Src {
fn from(value: Pred) -> Self {
Src {
src_ref: value.pred_ref.into(),
src_mod: if value.pred_inv {
SrcMod::BNot
} else {
SrcMod::None
},
src_swizzle: SrcSwizzle::None,
}
}
}
impl fmt::Display for Src {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self.src_mod {
SrcMod::None => write!(f, "{}{}", self.src_ref, self.src_swizzle),
SrcMod::FAbs => write!(f, "|{}{}|", self.src_ref, self.src_swizzle),
SrcMod::FNeg => write!(f, "-{}{}", self.src_ref, self.src_swizzle),
SrcMod::FNegAbs => {
write!(f, "-|{}{}|", self.src_ref, self.src_swizzle)
}
SrcMod::INeg => write!(f, "-{}{}", self.src_ref, self.src_swizzle),
SrcMod::BNot => write!(f, "!{}{}", self.src_ref, self.src_swizzle),
}
}
}
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SrcType {
SSA,
GPR,
ALU,
F16,
F16v2,
F32,
F64,
I32,
B32,
Pred,
Carry,
Bar,
}
impl SrcType {
const DEFAULT: SrcType = SrcType::GPR;
}
pub type SrcTypeList = AttrList<SrcType>;
pub trait SrcsAsSlice: AsSlice<Src, Attr = SrcType> {
fn srcs_as_slice(&self) -> &[Src] {
self.as_slice()
}
fn srcs_as_mut_slice(&mut self) -> &mut [Src] {
self.as_mut_slice()
}
fn src_types(&self) -> SrcTypeList {
self.attrs()
}
fn src_idx(&self, src: &Src) -> usize {
let r = self.srcs_as_slice().as_ptr_range();
assert!(r.contains(&(src as *const Src)));
unsafe { (src as *const Src).offset_from(r.start) as usize }
}
}
impl<T: AsSlice<Src, Attr = SrcType>> SrcsAsSlice for T {}
fn all_dsts_uniform(dsts: &[Dst]) -> bool {
let mut uniform = None;
for dst in dsts {
let dst_uniform = match dst {
Dst::None => continue,
Dst::Reg(r) => r.is_uniform(),
Dst::SSA(r) => r.file().is_uniform(),
};
assert!(uniform.is_none() || uniform == Some(dst_uniform));
uniform = Some(dst_uniform);
}
uniform == Some(true)
}
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum DstType {
Pred,
GPR,
F16,
F16v2,
F32,
F64,
Carry,
Bar,
Vec,
}
impl DstType {
const DEFAULT: DstType = DstType::Vec;
}
pub type DstTypeList = AttrList<DstType>;
pub trait DstsAsSlice: AsSlice<Dst, Attr = DstType> {
fn dsts_as_slice(&self) -> &[Dst] {
self.as_slice()
}
fn dsts_as_mut_slice(&mut self) -> &mut [Dst] {
self.as_mut_slice()
}
// Currently only used by test code
#[allow(dead_code)]
fn dst_types(&self) -> DstTypeList {
self.attrs()
}
fn dst_idx(&self, dst: &Dst) -> usize {
let r = self.dsts_as_slice().as_ptr_range();
assert!(r.contains(&(dst as *const Dst)));
unsafe { (dst as *const Dst).offset_from(r.start) as usize }
}
}
impl<T: AsSlice<Dst, Attr = DstType>> DstsAsSlice for T {}
pub trait IsUniform {
fn is_uniform(&self) -> bool;
}
impl<T: DstsAsSlice> IsUniform for T {
fn is_uniform(&self) -> bool {
all_dsts_uniform(self.dsts_as_slice())
}
}
fn fmt_dst_slice(f: &mut fmt::Formatter<'_>, dsts: &[Dst]) -> fmt::Result {
if dsts.is_empty() {
return Ok(());
}
// Figure out the last non-null dst
//
// Note: By making the top inclusive and starting at 0, we ensure that
// at least one dst always gets printed.
let mut last_dst = 0;
for (i, dst) in dsts.iter().enumerate() {
if !dst.is_none() {
last_dst = i;
}
}
for i in 0..=last_dst {
if i != 0 {
write!(f, " ")?;
}
write!(f, "{}", &dsts[i])?;
}
Ok(())
}
#[allow(dead_code)]
#[derive(Clone, Copy)]
pub enum FoldData {
Pred(bool),
Carry(bool),
U32(u32),
Vec2([u32; 2]),
}
pub struct OpFoldData<'a> {
pub dsts: &'a mut [FoldData],
pub srcs: &'a [FoldData],
}
impl OpFoldData<'_> {
#[allow(dead_code)]
pub fn get_pred_src(&self, op: &impl SrcsAsSlice, src: &Src) -> bool {
let i = op.src_idx(src);
let b = match src.src_ref {
SrcRef::Zero | SrcRef::Imm32(_) => panic!("Expected a predicate"),
SrcRef::True => true,
SrcRef::False => false,
_ => {
if let FoldData::Pred(b) = self.srcs[i] {
b
} else {
panic!("FoldData is not a predicate");
}
}
};
b ^ src.src_mod.is_bnot()
}
pub fn get_u32_src(&self, op: &impl SrcsAsSlice, src: &Src) -> u32 {
let i = op.src_idx(src);
match src.src_ref {
SrcRef::Zero => 0,
SrcRef::Imm32(imm) => imm,
SrcRef::True | SrcRef::False => panic!("Unexpected predicate"),
_ => {
if let FoldData::U32(u) = self.srcs[i] {
u
} else {
panic!("FoldData is not a U32");
}
}
}
}
#[allow(dead_code)]
pub fn get_u32_bnot_src(&self, op: &impl SrcsAsSlice, src: &Src) -> u32 {
let x = self.get_u32_src(op, src);
if src.src_mod.is_bnot() {
!x
} else {
x
}
}
#[allow(dead_code)]
pub fn get_carry_src(&self, op: &impl SrcsAsSlice, src: &Src) -> bool {
assert!(src.src_ref.as_ssa().is_some());
let i = op.src_idx(src);
if let FoldData::Carry(b) = self.srcs[i] {
b
} else {
panic!("FoldData is not a predicate");
}
}
#[allow(dead_code)]
pub fn get_f32_src(&self, op: &impl SrcsAsSlice, src: &Src) -> f32 {
f32::from_bits(self.get_u32_src(op, src))
}
#[allow(dead_code)]
pub fn get_f64_src(&self, op: &impl SrcsAsSlice, src: &Src) -> f64 {
let i = op.src_idx(src);
match src.src_ref {
SrcRef::Zero => 0.0,
SrcRef::Imm32(imm) => f64::from_bits(u64::from(imm) << 32),
SrcRef::True | SrcRef::False => panic!("Unexpected predicate"),
_ => {
if let FoldData::Vec2(v) = self.srcs[i] {
let u = u64::from(v[0]) | (u64::from(v[1]) << 32);
f64::from_bits(u)
} else {
panic!("FoldData is not a U32");
}
}
}
}
#[allow(dead_code)]
pub fn set_pred_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, b: bool) {
self.dsts[op.dst_idx(dst)] = FoldData::Pred(b);
}
#[allow(dead_code)]
pub fn set_carry_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, b: bool) {
self.dsts[op.dst_idx(dst)] = FoldData::Carry(b);
}
pub fn set_u32_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, u: u32) {
self.dsts[op.dst_idx(dst)] = FoldData::U32(u);
}
#[allow(dead_code)]
pub fn set_f32_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, f: f32) {
self.set_u32_dst(op, dst, f.to_bits());
}
#[allow(dead_code)]
pub fn set_f64_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, f: f64) {
let u = f.to_bits();
let v = [u as u32, (u >> 32) as u32];
self.dsts[op.dst_idx(dst)] = FoldData::Vec2(v);
}
}
pub trait Foldable: SrcsAsSlice + DstsAsSlice {
// Currently only used by test code
#[allow(dead_code)]
fn fold(&self, sm: &dyn ShaderModel, f: &mut OpFoldData<'_>);
}
pub trait DisplayOp: DstsAsSlice {
fn fmt_dsts(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt_dst_slice(f, self.dsts_as_slice())
}
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result;
}
// Hack struct so we can re-use Formatters. Shamelessly stolen from
// https://users.rust-lang.org/t/reusing-an-fmt-formatter/8531/4
pub struct Fmt<F>(pub F)
where
F: Fn(&mut fmt::Formatter) -> fmt::Result;
impl<F> fmt::Display for Fmt<F>
where
F: Fn(&mut fmt::Formatter) -> fmt::Result,
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
(self.0)(f)
}
}
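// Tiny illustration of the Fmt wrapper: any closure over a Formatter becomes
// Display-able, which is how fmt_dsts() gets spliced into strings below.
#[cfg(test)]
mod fmt_wrapper_example {
    use super::*;

    #[test]
    fn closure_as_display() {
        let hex = Fmt(|f: &mut fmt::Formatter<'_>| write!(f, "{:#x}", 48879));
        assert_eq!(format!("{}", hex), "0xbeef");
    }
}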
macro_rules! impl_display_for_op {
($op: ident) => {
impl fmt::Display for $op {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut s = String::new();
write!(s, "{}", Fmt(|f| self.fmt_dsts(f)))?;
if !s.is_empty() {
write!(f, "{} = ", s)?;
}
self.fmt_op(f)
}
}
};
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum PredSetOp {
And,
Or,
Xor,
}
impl PredSetOp {
#[allow(dead_code)]
pub fn eval(&self, a: bool, b: bool) -> bool {
match self {
PredSetOp::And => a & b,
PredSetOp::Or => a | b,
PredSetOp::Xor => a ^ b,
}
}
pub fn is_trivial(&self, accum: &Src) -> bool {
if let Some(b) = accum.as_bool() {
match self {
PredSetOp::And => b,
PredSetOp::Or => !b,
PredSetOp::Xor => !b,
}
} else {
false
}
}
}
impl fmt::Display for PredSetOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PredSetOp::And => write!(f, ".and"),
PredSetOp::Or => write!(f, ".or"),
PredSetOp::Xor => write!(f, ".xor"),
}
}
}
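// Illustrative sketch of PredSetOp::is_trivial(): an accumulator equal to the
// identity for the op (true for .and, false for .or/.xor) can be dropped when
// printing or folding.
#[cfg(test)]
mod pred_set_op_example {
    use super::*;

    #[test]
    fn trivial_accumulators() {
        assert!(PredSetOp::And.is_trivial(&true.into()));
        assert!(PredSetOp::Or.is_trivial(&false.into()));
        assert!(!PredSetOp::Xor.is_trivial(&true.into()));
        // !pT folds to false, which is not the identity for .and.
        assert!(!PredSetOp::And.is_trivial(&Src::from(true).bnot()));
    }
}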
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum FloatCmpOp {
OrdEq,
OrdNe,
OrdLt,
OrdLe,
OrdGt,
OrdGe,
UnordEq,
UnordNe,
UnordLt,
UnordLe,
UnordGt,
UnordGe,
IsNum,
IsNan,
}
impl FloatCmpOp {
pub fn flip(self) -> FloatCmpOp {
match self {
FloatCmpOp::OrdEq | FloatCmpOp::OrdNe => self,
FloatCmpOp::OrdLt => FloatCmpOp::OrdGt,
FloatCmpOp::OrdLe => FloatCmpOp::OrdGe,
FloatCmpOp::OrdGt => FloatCmpOp::OrdLt,
FloatCmpOp::OrdGe => FloatCmpOp::OrdLe,
FloatCmpOp::UnordEq | FloatCmpOp::UnordNe => self,
FloatCmpOp::UnordLt => FloatCmpOp::UnordGt,
FloatCmpOp::UnordLe => FloatCmpOp::UnordGe,
FloatCmpOp::UnordGt => FloatCmpOp::UnordLt,
FloatCmpOp::UnordGe => FloatCmpOp::UnordLe,
FloatCmpOp::IsNum | FloatCmpOp::IsNan => panic!("Cannot flip unop"),
}
}
}
impl fmt::Display for FloatCmpOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FloatCmpOp::OrdEq => write!(f, ".eq"),
FloatCmpOp::OrdNe => write!(f, ".ne"),
FloatCmpOp::OrdLt => write!(f, ".lt"),
FloatCmpOp::OrdLe => write!(f, ".le"),
FloatCmpOp::OrdGt => write!(f, ".gt"),
FloatCmpOp::OrdGe => write!(f, ".ge"),
FloatCmpOp::UnordEq => write!(f, ".equ"),
FloatCmpOp::UnordNe => write!(f, ".neu"),
FloatCmpOp::UnordLt => write!(f, ".ltu"),
FloatCmpOp::UnordLe => write!(f, ".leu"),
FloatCmpOp::UnordGt => write!(f, ".gtu"),
FloatCmpOp::UnordGe => write!(f, ".geu"),
FloatCmpOp::IsNum => write!(f, ".num"),
FloatCmpOp::IsNan => write!(f, ".nan"),
}
}
}
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum IntCmpOp {
False,
True,
Eq,
Ne,
Lt,
Le,
Gt,
Ge,
}
impl IntCmpOp {
pub fn flip(self) -> IntCmpOp {
match self {
IntCmpOp::False | IntCmpOp::True => self,
IntCmpOp::Eq | IntCmpOp::Ne => self,
IntCmpOp::Lt => IntCmpOp::Gt,
IntCmpOp::Le => IntCmpOp::Ge,
IntCmpOp::Gt => IntCmpOp::Lt,
IntCmpOp::Ge => IntCmpOp::Le,
}
}
}
impl fmt::Display for IntCmpOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
IntCmpOp::False => write!(f, ".f"),
IntCmpOp::True => write!(f, ".t"),
IntCmpOp::Eq => write!(f, ".eq"),
IntCmpOp::Ne => write!(f, ".ne"),
IntCmpOp::Lt => write!(f, ".lt"),
IntCmpOp::Le => write!(f, ".le"),
IntCmpOp::Gt => write!(f, ".gt"),
IntCmpOp::Ge => write!(f, ".ge"),
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum IntCmpType {
U32,
I32,
}
impl IntCmpType {
#[allow(dead_code)]
pub fn is_signed(&self) -> bool {
match self {
IntCmpType::U32 => false,
IntCmpType::I32 => true,
}
}
}
impl fmt::Display for IntCmpType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
IntCmpType::U32 => write!(f, ".u32"),
IntCmpType::I32 => write!(f, ".i32"),
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum LogicOp2 {
And,
Or,
Xor,
PassB,
}
impl fmt::Display for LogicOp2 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
LogicOp2::And => write!(f, "and"),
LogicOp2::Or => write!(f, "or"),
LogicOp2::Xor => write!(f, "xor"),
LogicOp2::PassB => write!(f, "pass_b"),
}
}
}
impl LogicOp2 {
pub fn to_lut(self) -> LogicOp3 {
match self {
LogicOp2::And => LogicOp3::new_lut(&|x, y, _| x & y),
LogicOp2::Or => LogicOp3::new_lut(&|x, y, _| x | y),
LogicOp2::Xor => LogicOp3::new_lut(&|x, y, _| x ^ y),
LogicOp2::PassB => LogicOp3::new_lut(&|_, b, _| b),
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub struct LogicOp3 {
pub lut: u8,
}
impl LogicOp3 {
pub const SRC_MASKS: [u8; 3] = [0xf0, 0xcc, 0xaa];
#[inline]
pub fn new_lut<F: Fn(u8, u8, u8) -> u8>(f: &F) -> LogicOp3 {
LogicOp3 {
lut: f(
LogicOp3::SRC_MASKS[0],
LogicOp3::SRC_MASKS[1],
LogicOp3::SRC_MASKS[2],
),
}
}
pub fn new_const(val: bool) -> LogicOp3 {
LogicOp3 {
lut: if val { !0 } else { 0 },
}
}
pub fn src_used(&self, src_idx: usize) -> bool {
let mask = LogicOp3::SRC_MASKS[src_idx];
let shift = LogicOp3::SRC_MASKS[src_idx].trailing_zeros();
self.lut & !mask != (self.lut >> shift) & !mask
}
pub fn fix_src(&mut self, src_idx: usize, val: bool) {
let mask = LogicOp3::SRC_MASKS[src_idx];
let shift = LogicOp3::SRC_MASKS[src_idx].trailing_zeros();
if val {
let t_bits = self.lut & mask;
self.lut = t_bits | (t_bits >> shift)
} else {
let f_bits = self.lut & !mask;
self.lut = (f_bits << shift) | f_bits
};
}
pub fn invert_src(&mut self, src_idx: usize) {
let mask = LogicOp3::SRC_MASKS[src_idx];
let shift = LogicOp3::SRC_MASKS[src_idx].trailing_zeros();
let t_bits = self.lut & mask;
let f_bits = self.lut & !mask;
self.lut = (f_bits << shift) | (t_bits >> shift);
}
pub fn eval<
T: BitAnd<Output = T> + BitOr<Output = T> + Copy + Not<Output = T>,
>(
&self,
x: T,
y: T,
z: T,
) -> T {
let mut res = x & !x; // zero
if (self.lut & (1 << 0)) != 0 {
res = res | (!x & !y & !z);
}
if (self.lut & (1 << 1)) != 0 {
res = res | (!x & !y & z);
}
if (self.lut & (1 << 2)) != 0 {
res = res | (!x & y & !z);
}
if (self.lut & (1 << 3)) != 0 {
res = res | (!x & y & z);
}
if (self.lut & (1 << 4)) != 0 {
res = res | (x & !y & !z);
}
if (self.lut & (1 << 5)) != 0 {
res = res | (x & !y & z);
}
if (self.lut & (1 << 6)) != 0 {
res = res | (x & y & !z);
}
if (self.lut & (1 << 7)) != 0 {
res = res | (x & y & z);
}
res
}
}
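// Illustrative sketch of the LUT encoding implied by SRC_MASKS: a LUT is
// built by evaluating the op on the three source masks and can then be
// specialized with fix_src(). Uses only the LogicOp3 API above.
#[cfg(test)]
mod logic_op3_example {
    use super::*;

    #[test]
    fn build_and_fix_lut() {
        // x & y is the AND of the first two source masks.
        let and_xy = LogicOp3::new_lut(&|x, y, _| x & y);
        assert_eq!(and_xy.lut, 0xf0 & 0xcc);
        assert!(and_xy.src_used(0) && !and_xy.src_used(2));
        // Fixing z = false in (x & y) | z recovers plain x & y.
        let mut op = LogicOp3::new_lut(&|x, y, z| (x & y) | z);
        op.fix_src(2, false);
        assert_eq!(op.lut, and_xy.lut);
        // eval() agrees with the LUT on concrete bit patterns.
        assert_eq!(and_xy.eval(0b1100_u8, 0b1010, 0b0000), 0b1000);
    }
}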
impl fmt::Display for LogicOp3 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "LUT[{:#x}]", self.lut)
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum FloatType {
F16,
F32,
F64,
}
impl FloatType {
pub fn from_bits(bits: usize) -> FloatType {
match bits {
16 => FloatType::F16,
32 => FloatType::F32,
64 => FloatType::F64,
_ => panic!("Invalid float type size"),
}
}
pub fn bits(&self) -> usize {
match self {
FloatType::F16 => 16,
FloatType::F32 => 32,
FloatType::F64 => 64,
}
}
}
impl fmt::Display for FloatType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FloatType::F16 => write!(f, ".f16"),
FloatType::F32 => write!(f, ".f32"),
FloatType::F64 => write!(f, ".f64"),
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum FRndMode {
NearestEven,
NegInf,
PosInf,
Zero,
}
impl fmt::Display for FRndMode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FRndMode::NearestEven => write!(f, ".re"),
FRndMode::NegInf => write!(f, ".rm"),
FRndMode::PosInf => write!(f, ".rp"),
FRndMode::Zero => write!(f, ".rz"),
}
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct TexCBufRef {
pub idx: u8,
pub offset: u16,
}
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum TexRef {
Bound(u16),
CBuf(TexCBufRef),
Bindless,
}
impl fmt::Display for TexRef {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TexRef::Bound(idx) => write!(f, "tex[{idx}]"),
TexRef::CBuf(TexCBufRef { idx, offset }) => {
write!(f, "c[{idx:#x}][{offset:#x}]")
}
TexRef::Bindless => write!(f, "bindless"),
}
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum TexDim {
_1D,
Array1D,
_2D,
Array2D,
_3D,
Cube,
ArrayCube,
}
impl fmt::Display for TexDim {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TexDim::_1D => write!(f, ".1d"),
TexDim::Array1D => write!(f, ".a1d"),
TexDim::_2D => write!(f, ".2d"),
TexDim::Array2D => write!(f, ".a2d"),
TexDim::_3D => write!(f, ".3d"),
TexDim::Cube => write!(f, ".cube"),
TexDim::ArrayCube => write!(f, ".acube"),
}
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum TexLodMode {
Auto,
Zero,
Bias,
Lod,
Clamp,
BiasClamp,
}
impl TexLodMode {
pub fn is_explicit_lod(&self) -> bool {
match self {
TexLodMode::Auto
| TexLodMode::Bias
| TexLodMode::Clamp
| TexLodMode::BiasClamp => false,
TexLodMode::Zero | TexLodMode::Lod => true,
}
}
}
impl fmt::Display for TexLodMode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TexLodMode::Auto => write!(f, ""),
TexLodMode::Zero => write!(f, ".lz"),
TexLodMode::Bias => write!(f, ".lb"),
TexLodMode::Lod => write!(f, ".ll"),
TexLodMode::Clamp => write!(f, ".lc"),
TexLodMode::BiasClamp => write!(f, ".lb.lc"),
}
}
}
/// Derivative behavior for tex ops and FSwzAdd
///
/// The descriptions here may not be wholly accurate as they come from cobbling
/// together a bunch of pieces. This is my (Faith's) best understanding of how
/// these things work.
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum TexDerivMode {
/// Automatic
///
/// For partial (not full) quads, the derivative will default to the value
/// of DEFAULT_PARTIAL in SET_SHADER_CONTROL.
///
/// On Volta and earlier GPUs or on Blackwell B and later, derivatives in
/// all non-fragment shader stages are assumed to be partial.
Auto,
/// Assume a non-divergent (full) derivative
///
/// Partial derivative checks are skipped and the hardware does the
/// derivative anyway, possibly on rubbish data.
NonDivergent,
/// Force the derivative to be considered divergent (partial)
///
/// This only exists as a separate thing on Blackwell A. On Hopper and
/// earlier, there is a .fdv that's part of the LodMode, but only for
/// LodMode::Clamp. On Blackwell B, it appears (according to the
/// disassembler) to be removed again in favor of DerivXY.
ForceDivergent,
/// Attempt an X/Y derivative, ignoring shader stage
///
/// This is (I think) identical to Auto except that it ignores the shader
/// stage checks. This is new on Blackwell B+.
DerivXY,
}
impl fmt::Display for TexDerivMode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TexDerivMode::Auto => Ok(()),
TexDerivMode::NonDivergent => write!(f, ".ndv"),
TexDerivMode::ForceDivergent => write!(f, ".fdv"),
TexDerivMode::DerivXY => write!(f, ".dxy"),
}
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct ChannelMask(u8);
impl ChannelMask {
pub fn new(mask: u8) -> Self {
assert!(mask != 0 && (mask & !0xf) == 0);
ChannelMask(mask)
}
pub fn for_comps(comps: u8) -> Self {
assert!(comps > 0 && comps <= 4);
ChannelMask((1 << comps) - 1)
}
pub fn to_bits(self) -> u8 {
self.0
}
}
impl fmt::Display for ChannelMask {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, ".")?;
for (i, c) in ['r', 'g', 'b', 'a'].into_iter().enumerate() {
if self.0 & (1 << i) != 0 {
write!(f, "{c}")?;
}
}
Ok(())
}
}
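// Quick illustration of ChannelMask: for_comps() builds a dense prefix mask
// and Display prints it with rgba letters. A sketch using only the API above.
#[cfg(test)]
mod channel_mask_example {
    use super::*;

    #[test]
    fn dense_masks() {
        let rgb = ChannelMask::for_comps(3);
        assert_eq!(rgb.to_bits(), 0b0111);
        assert_eq!(format!("{}", rgb), ".rgb");
        // Sparse masks are allowed too, as long as they're non-empty.
        assert_eq!(format!("{}", ChannelMask::new(0b1010)), ".ga");
    }
}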
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum TexOffsetMode {
None,
AddOffI,
PerPx, // tld4 only
}
impl fmt::Display for TexOffsetMode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TexOffsetMode::None => write!(f, ""),
TexOffsetMode::AddOffI => write!(f, ".aoffi"),
TexOffsetMode::PerPx => write!(f, ".ptp"),
}
}
}
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum TexQuery {
Dimension,
TextureType,
SamplerPos,
}
impl fmt::Display for TexQuery {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TexQuery::Dimension => write!(f, "dimension"),
TexQuery::TextureType => write!(f, "texture_type"),
TexQuery::SamplerPos => write!(f, "sampler_pos"),
}
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum ImageDim {
_1D,
_1DBuffer,
_1DArray,
_2D,
_2DArray,
_3D,
}
impl ImageDim {
pub fn coord_comps(&self) -> u8 {
match self {
ImageDim::_1D => 1,
ImageDim::_1DBuffer => 1,
ImageDim::_1DArray => 2,
ImageDim::_2D => 2,
ImageDim::_2DArray => 3,
ImageDim::_3D => 3,
}
}
}
impl fmt::Display for ImageDim {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ImageDim::_1D => write!(f, ".1d"),
ImageDim::_1DBuffer => write!(f, ".buf"),
ImageDim::_1DArray => write!(f, ".a1d"),
ImageDim::_2D => write!(f, ".2d"),
ImageDim::_2DArray => write!(f, ".a2d"),
ImageDim::_3D => write!(f, ".3d"),
}
}
}
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum IntType {
U8,
I8,
U16,
I16,
U32,
I32,
U64,
I64,
}
impl IntType {
pub fn from_bits(bits: usize, is_signed: bool) -> IntType {
match bits {
8 => {
if is_signed {
IntType::I8
} else {
IntType::U8
}
}
16 => {
if is_signed {
IntType::I16
} else {
IntType::U16
}
}
32 => {
if is_signed {
IntType::I32
} else {
IntType::U32
}
}
64 => {
if is_signed {
IntType::I64
} else {
IntType::U64
}
}
_ => panic!("Invalid integer type size"),
}
}
pub fn is_signed(&self) -> bool {
match self {
IntType::U8 | IntType::U16 | IntType::U32 | IntType::U64 => false,
IntType::I8 | IntType::I16 | IntType::I32 | IntType::I64 => true,
}
}
pub fn bits(&self) -> usize {
match self {
IntType::U8 | IntType::I8 => 8,
IntType::U16 | IntType::I16 => 16,
IntType::U32 | IntType::I32 => 32,
IntType::U64 | IntType::I64 => 64,
}
}
}
impl fmt::Display for IntType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
IntType::U8 => write!(f, ".u8"),
IntType::I8 => write!(f, ".i8"),
IntType::U16 => write!(f, ".u16"),
IntType::I16 => write!(f, ".i16"),
IntType::U32 => write!(f, ".u32"),
IntType::I32 => write!(f, ".i32"),
IntType::U64 => write!(f, ".u64"),
IntType::I64 => write!(f, ".i64"),
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum MemAddrType {
A32,
A64,
}
impl fmt::Display for MemAddrType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemAddrType::A32 => write!(f, ".a32"),
MemAddrType::A64 => write!(f, ".a64"),
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum MemType {
U8,
I8,
U16,
I16,
B32,
B64,
B128,
}
impl MemType {
pub fn from_size(size: u8, is_signed: bool) -> MemType {
match size {
1 => {
if is_signed {
MemType::I8
} else {
MemType::U8
}
}
2 => {
if is_signed {
MemType::I16
} else {
MemType::U16
}
}
4 => MemType::B32,
8 => MemType::B64,
16 => MemType::B128,
_ => panic!("Invalid memory load/store size"),
}
}
#[allow(dead_code)]
pub fn bits(&self) -> usize {
match self {
MemType::U8 | MemType::I8 => 8,
MemType::U16 | MemType::I16 => 16,
MemType::B32 => 32,
MemType::B64 => 64,
MemType::B128 => 128,
}
}
}
impl fmt::Display for MemType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemType::U8 => write!(f, ".u8"),
MemType::I8 => write!(f, ".i8"),
MemType::U16 => write!(f, ".u16"),
MemType::I16 => write!(f, ".i16"),
MemType::B32 => write!(f, ".b32"),
MemType::B64 => write!(f, ".b64"),
MemType::B128 => write!(f, ".b128"),
}
}
}
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum MemOrder {
Constant,
Weak,
Strong(MemScope),
}
impl fmt::Display for MemOrder {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemOrder::Constant => write!(f, ".constant"),
MemOrder::Weak => write!(f, ".weak"),
MemOrder::Strong(scope) => write!(f, ".strong{}", scope),
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum MemScope {
CTA,
GPU,
System,
}
impl fmt::Display for MemScope {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemScope::CTA => write!(f, ".cta"),
MemScope::GPU => write!(f, ".gpu"),
MemScope::System => write!(f, ".sys"),
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum MemSpace {
Global(MemAddrType),
Local,
Shared,
}
impl MemSpace {
pub fn addr_type(&self) -> MemAddrType {
match self {
MemSpace::Global(t) => *t,
MemSpace::Local => MemAddrType::A32,
MemSpace::Shared => MemAddrType::A32,
}
}
}
impl fmt::Display for MemSpace {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemSpace::Global(t) => write!(f, ".global{t}"),
MemSpace::Local => write!(f, ".local"),
MemSpace::Shared => write!(f, ".shared"),
}
}
}
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum MemEvictionPriority {
First,
Normal,
Last,
LastUse,
Unchanged,
NoAllocate,
}
impl fmt::Display for MemEvictionPriority {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemEvictionPriority::First => write!(f, ".ef"),
MemEvictionPriority::Normal => Ok(()),
MemEvictionPriority::Last => write!(f, ".el"),
MemEvictionPriority::LastUse => write!(f, ".lu"),
MemEvictionPriority::Unchanged => write!(f, ".eu"),
MemEvictionPriority::NoAllocate => write!(f, ".na"),
}
}
}
/// Memory load cache ops used by Kepler
#[allow(dead_code)]
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq)]
pub enum LdCacheOp {
#[default]
CacheAll,
CacheGlobal,
/// This cache mode is not officially documented by NVIDIA. What we do
/// know is that the CUDA C programming guide says:
///
/// > The read-only data cache load function is only supported by devices
/// > of compute capability 5.0 and higher.
/// > ```c
/// > T __ldg(const T* address);
/// > ```
///
/// and we know that `__ldg()` compiles to `ld.global.nc` in PTX which
/// compiles to `ld.ci`. The PTX 5.0 docs say:
///
/// > Load register variable `d` from the location specified by the source
/// > address operand `a` in the global state space, and optionally cache in
/// > non-coherent texture cache. Since the cache is non-coherent, the data
/// > should be read-only within the kernel's process.
///
/// Since `.nc` means "non-coherent", the name "incoherent" seems about
/// right. The quote above also seems to imply that these loads go through
/// the texture cache, but we don't fully understand the implications of
/// that.
CacheIncoherent,
CacheStreaming,
CacheInvalidate,
}
impl fmt::Display for LdCacheOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
LdCacheOp::CacheAll => write!(f, ".ca"),
LdCacheOp::CacheGlobal => write!(f, ".cg"),
LdCacheOp::CacheIncoherent => write!(f, ".ci"),
LdCacheOp::CacheStreaming => write!(f, ".cs"),
LdCacheOp::CacheInvalidate => write!(f, ".cv"),
}
}
}
impl LdCacheOp {
pub fn select(
sm: &dyn ShaderModel,
space: MemSpace,
order: MemOrder,
_eviction_priority: MemEvictionPriority,
) -> Self {
match space {
MemSpace::Global(_) => match order {
MemOrder::Constant => {
if sm.sm() >= 50 {
// This is undocumented in the CUDA docs but NVIDIA uses
// it for constant loads.
LdCacheOp::CacheIncoherent
} else {
LdCacheOp::CacheAll
}
}
MemOrder::Strong(MemScope::System) => {
LdCacheOp::CacheInvalidate
}
_ => {
// From the CUDA 10.2 docs:
//
// "The default load instruction cache operation is
// ld.ca, which allocates cache lines in all levels (L1
// and L2) with normal eviction policy. Global data is
// coherent at the L2 level, but multiple L1 caches are
// not coherent for global data. If one thread stores to
// global memory via one L1 cache, and a second thread
// loads that address via a second L1 cache with ld.ca,
// the second thread may get stale L1 cache data"
//
// and
//
// "L1 caching in Kepler GPUs is reserved only for local
// memory accesses, such as register spills and stack
// data. Global loads are cached in L2 only (or in the
// Read-Only Data Cache)."
//
// We follow suit and use CacheGlobal for all global memory
// access on Kepler. On Maxwell, it appears safe to use
// CacheAll for everything.
if sm.sm() >= 50 {
LdCacheOp::CacheAll
} else {
LdCacheOp::CacheGlobal
}
}
},
MemSpace::Local | MemSpace::Shared => LdCacheOp::CacheAll,
}
}
}
/// Memory store cache ops used by Kepler
#[allow(dead_code)]
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq)]
pub enum StCacheOp {
#[default]
WriteBack,
CacheGlobal,
CacheStreaming,
WriteThrough,
}
impl fmt::Display for StCacheOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
StCacheOp::WriteBack => write!(f, ".wb"),
StCacheOp::CacheGlobal => write!(f, ".cg"),
StCacheOp::CacheStreaming => write!(f, ".cs"),
StCacheOp::WriteThrough => write!(f, ".wt"),
}
}
}
impl StCacheOp {
pub fn select(
sm: &dyn ShaderModel,
space: MemSpace,
order: MemOrder,
_eviction_priority: MemEvictionPriority,
) -> Self {
match space {
MemSpace::Global(_) => match order {
MemOrder::Constant => panic!("Cannot store to constant"),
MemOrder::Strong(MemScope::System) => StCacheOp::WriteThrough,
_ => {
// See the corresponding comment in LdCacheOp::select()
if sm.sm() >= 50 {
StCacheOp::WriteBack
} else {
StCacheOp::CacheGlobal
}
}
},
MemSpace::Local | MemSpace::Shared => StCacheOp::WriteBack,
}
}
}
#[derive(Clone)]
pub struct MemAccess {
pub mem_type: MemType,
pub space: MemSpace,
pub order: MemOrder,
pub eviction_priority: MemEvictionPriority,
}
impl fmt::Display for MemAccess {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"{}{}{}{}",
self.space, self.order, self.eviction_priority, self.mem_type,
)
}
}
impl MemAccess {
pub fn ld_cache_op(&self, sm: &dyn ShaderModel) -> LdCacheOp {
LdCacheOp::select(sm, self.space, self.order, self.eviction_priority)
}
pub fn st_cache_op(&self, sm: &dyn ShaderModel) -> StCacheOp {
StCacheOp::select(sm, self.space, self.order, self.eviction_priority)
}
}
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum AtomType {
F16x2,
U32,
I32,
F32,
U64,
I64,
F64,
}
impl AtomType {
#[allow(non_snake_case)]
pub fn F(bits: u8) -> AtomType {
match bits {
16 => panic!("16-bit float atomics not yet supported"),
32 => AtomType::F32,
64 => AtomType::F64,
_ => panic!("Invalid float atomic type"),
}
}
#[allow(non_snake_case)]
pub fn U(bits: u8) -> AtomType {
match bits {
32 => AtomType::U32,
64 => AtomType::U64,
_ => panic!("Invalid uint atomic type"),
}
}
#[allow(non_snake_case)]
pub fn I(bits: u8) -> AtomType {
match bits {
32 => AtomType::I32,
64 => AtomType::I64,
_ => panic!("Invalid int atomic type"),
}
}
pub fn bits(&self) -> usize {
match self {
AtomType::F16x2 | AtomType::F32 => 32,
AtomType::U32 | AtomType::I32 => 32,
AtomType::U64 | AtomType::I64 | AtomType::F64 => 64,
}
}
pub fn is_float(&self) -> bool {
match self {
AtomType::F16x2 | AtomType::F32 | AtomType::F64 => true,
AtomType::U32 | AtomType::I32 | AtomType::U64 | AtomType::I64 => {
false
}
}
}
}
impl fmt::Display for AtomType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
AtomType::F16x2 => write!(f, ".f16x2"),
AtomType::U32 => write!(f, ".u32"),
AtomType::I32 => write!(f, ".i32"),
AtomType::F32 => write!(f, ".f32"),
AtomType::U64 => write!(f, ".u64"),
AtomType::I64 => write!(f, ".i64"),
AtomType::F64 => write!(f, ".f64"),
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum AtomCmpSrc {
/// The cmpr value is passed as a separate source
Separate,
/// The cmpr value is packed together with the data, with cmpr coming first
Packed,
}
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum AtomOp {
Add,
Min,
Max,
Inc,
Dec,
And,
Or,
Xor,
Exch,
CmpExch(AtomCmpSrc),
}
impl AtomOp {
pub fn is_reduction(&self) -> bool {
match self {
AtomOp::Add
| AtomOp::Min
| AtomOp::Max
| AtomOp::Inc
| AtomOp::Dec
| AtomOp::And
| AtomOp::Or
| AtomOp::Xor => true,
AtomOp::Exch | AtomOp::CmpExch(_) => false,
}
}
}
impl fmt::Display for AtomOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
AtomOp::Add => write!(f, ".add"),
AtomOp::Min => write!(f, ".min"),
AtomOp::Max => write!(f, ".max"),
AtomOp::Inc => write!(f, ".inc"),
AtomOp::Dec => write!(f, ".dec"),
AtomOp::And => write!(f, ".and"),
AtomOp::Or => write!(f, ".or"),
AtomOp::Xor => write!(f, ".xor"),
AtomOp::Exch => write!(f, ".exch"),
AtomOp::CmpExch(AtomCmpSrc::Separate) => write!(f, ".cmpexch"),
AtomOp::CmpExch(AtomCmpSrc::Packed) => write!(f, ".cmpexch.packed"),
}
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum InterpFreq {
Pass,
PassMulW,
Constant,
State,
}
impl fmt::Display for InterpFreq {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
InterpFreq::Pass => write!(f, ".pass"),
InterpFreq::PassMulW => write!(f, ".pass_mul_w"),
InterpFreq::Constant => write!(f, ".constant"),
InterpFreq::State => write!(f, ".state"),
}
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum InterpLoc {
Default,
Centroid,
Offset,
}
impl fmt::Display for InterpLoc {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
InterpLoc::Default => Ok(()),
InterpLoc::Centroid => write!(f, ".centroid"),
InterpLoc::Offset => write!(f, ".offset"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFAdd {
#[dst_type(F32)]
pub dst: Dst,
#[src_type(F32)]
pub srcs: [Src; 2],
pub saturate: bool,
pub rnd_mode: FRndMode,
pub ftz: bool,
}
impl DisplayOp for OpFAdd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let sat = if self.saturate { ".sat" } else { "" };
write!(f, "fadd{sat}")?;
if self.rnd_mode != FRndMode::NearestEven {
write!(f, "{}", self.rnd_mode)?;
}
if self.ftz {
write!(f, ".ftz")?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1],)
}
}
impl_display_for_op!(OpFAdd);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFFma {
#[dst_type(F32)]
pub dst: Dst,
#[src_type(F32)]
pub srcs: [Src; 3],
pub saturate: bool,
pub rnd_mode: FRndMode,
pub ftz: bool,
pub dnz: bool,
}
impl DisplayOp for OpFFma {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let sat = if self.saturate { ".sat" } else { "" };
write!(f, "ffma{sat}")?;
if self.rnd_mode != FRndMode::NearestEven {
write!(f, "{}", self.rnd_mode)?;
}
if self.dnz {
write!(f, ".dnz")?;
} else if self.ftz {
write!(f, ".ftz")?;
}
write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2])
}
}
impl_display_for_op!(OpFFma);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFMnMx {
#[dst_type(F32)]
pub dst: Dst,
#[src_type(F32)]
pub srcs: [Src; 2],
#[src_type(Pred)]
pub min: Src,
pub ftz: bool,
}
impl DisplayOp for OpFMnMx {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let ftz = if self.ftz { ".ftz" } else { "" };
write!(
f,
"fmnmx{ftz} {} {} {}",
self.srcs[0], self.srcs[1], self.min
)
}
}
impl_display_for_op!(OpFMnMx);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFMul {
#[dst_type(F32)]
pub dst: Dst,
#[src_type(F32)]
pub srcs: [Src; 2],
pub saturate: bool,
pub rnd_mode: FRndMode,
pub ftz: bool,
pub dnz: bool,
}
impl DisplayOp for OpFMul {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let sat = if self.saturate { ".sat" } else { "" };
write!(f, "fmul{sat}")?;
if self.rnd_mode != FRndMode::NearestEven {
write!(f, "{}", self.rnd_mode)?;
}
if self.dnz {
write!(f, ".dnz")?;
} else if self.ftz {
write!(f, ".ftz")?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1],)
}
}
impl_display_for_op!(OpFMul);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFSet {
#[dst_type(F32)]
pub dst: Dst,
pub cmp_op: FloatCmpOp,
#[src_type(F32)]
pub srcs: [Src; 2],
pub ftz: bool,
}
impl DisplayOp for OpFSet {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let ftz = if self.ftz { ".ftz" } else { "" };
write!(
f,
"fset{}{ftz} {} {}",
self.cmp_op, self.srcs[0], self.srcs[1]
)
}
}
impl_display_for_op!(OpFSet);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFSetP {
#[dst_type(Pred)]
pub dst: Dst,
pub set_op: PredSetOp,
pub cmp_op: FloatCmpOp,
#[src_type(F32)]
pub srcs: [Src; 2],
#[src_type(Pred)]
pub accum: Src,
pub ftz: bool,
}
impl DisplayOp for OpFSetP {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let ftz = if self.ftz { ".ftz" } else { "" };
write!(f, "fsetp{}{ftz}", self.cmp_op)?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, "{}", self.set_op)?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1])?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, " {}", self.accum)?;
}
Ok(())
}
}
impl_display_for_op!(OpFSetP);
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum FSwzAddOp {
Add,
SubRight,
SubLeft,
MoveLeft,
}
impl fmt::Display for FSwzAddOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FSwzAddOp::Add => write!(f, "add"),
FSwzAddOp::SubRight => write!(f, "subr"),
FSwzAddOp::SubLeft => write!(f, "sub"),
FSwzAddOp::MoveLeft => write!(f, "mov2"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFSwzAdd {
#[dst_type(F32)]
pub dst: Dst,
#[src_type(GPR)]
pub srcs: [Src; 2],
pub rnd_mode: FRndMode,
pub ftz: bool,
pub deriv_mode: TexDerivMode,
pub ops: [FSwzAddOp; 4],
}
impl DisplayOp for OpFSwzAdd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "fswzadd",)?;
if self.rnd_mode != FRndMode::NearestEven {
write!(f, "{}", self.rnd_mode)?;
}
if self.ftz {
write!(f, ".ftz")?;
}
write!(f, "{}", self.deriv_mode)?;
write!(
f,
" {} {} [{}, {}, {}, {}]",
self.srcs[0],
self.srcs[1],
self.ops[0],
self.ops[1],
self.ops[2],
self.ops[3],
)
}
}
impl_display_for_op!(OpFSwzAdd);
/// Describes where the second src is taken from before doing the ops
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum FSwzShuffle {
Quad0,
Quad1,
Quad2,
Quad3,
// swap [0, 1] and [2, 3]
SwapHorizontal,
// swap [0, 2] and [1, 3]
SwapVertical,
}
impl fmt::Display for FSwzShuffle {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FSwzShuffle::Quad0 => write!(f, ".0000"),
FSwzShuffle::Quad1 => write!(f, ".1111"),
FSwzShuffle::Quad2 => write!(f, ".2222"),
FSwzShuffle::Quad3 => write!(f, ".3333"),
FSwzShuffle::SwapHorizontal => write!(f, ".1032"),
FSwzShuffle::SwapVertical => write!(f, ".2301"),
}
}
}
/// Op only present on Kepler and older.
/// It first does a shuffle on the second src and then applies
/// src0 op src1; each thread in a quad may apply a different operation.
///
/// This is used to encode ddx/ddy.
/// ex: ddx
/// src1 = shuffle swap horizontal src1
/// ops = [sub, subr, sub, subr]
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFSwz {
#[dst_type(F32)]
pub dst: Dst,
#[src_type(GPR)]
pub srcs: [Src; 2],
pub rnd_mode: FRndMode,
pub ftz: bool,
pub deriv_mode: TexDerivMode,
pub shuffle: FSwzShuffle,
pub ops: [FSwzAddOp; 4],
}
impl DisplayOp for OpFSwz {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "fswz{}", self.shuffle)?;
if self.rnd_mode != FRndMode::NearestEven {
write!(f, "{}", self.rnd_mode)?;
}
write!(f, "{}", self.deriv_mode)?;
if self.ftz {
write!(f, ".ftz")?;
}
write!(
f,
" {} {} [{}, {}, {}, {}]",
self.srcs[0],
self.srcs[1],
self.ops[0],
self.ops[1],
self.ops[2],
self.ops[3],
)
}
}
impl_display_for_op!(OpFSwz);
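// A minimal sketch of the ddx encoding described in the comment above,
// assuming SwapHorizontal exchanges horizontal neighbors (lane i <-> i ^ 1)
// and that "sub" computes src0 - src1 while "subr" computes src1 - src0.
#[cfg(test)]
mod fswz_ddx_example {
    #[test]
    fn ddx_pattern() {
        let v = [1.0_f32, 4.0, 9.0, 16.0]; // per-lane quad values
        // Shuffled second source: each lane reads its horizontal neighbor.
        let src1: Vec<f32> = (0..4).map(|i| v[i ^ 1]).collect();
        let ops = ["sub", "subr", "sub", "subr"];
        let dst: Vec<f32> = (0..4)
            .map(|i| match ops[i] {
                "sub" => v[i] - src1[i],
                "subr" => src1[i] - v[i],
                _ => unreachable!(),
            })
            .collect();
        // Both lanes of each horizontal pair agree on the derivative.
        assert_eq!(dst[0], dst[1]);
        assert_eq!(dst[2], dst[3]);
    }
}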
pub enum RroOp {
SinCos,
Exp2,
}
impl fmt::Display for RroOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
RroOp::SinCos => write!(f, ".sincos"),
RroOp::Exp2 => write!(f, ".exp2"),
}
}
}
/// MuFu range reduction operator
///
/// Not available on SM70+
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpRro {
#[dst_type(F32)]
pub dst: Dst,
pub op: RroOp,
#[src_type(F32)]
pub src: Src,
}
impl DisplayOp for OpRro {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "rro{} {}", self.op, self.src)
}
}
impl_display_for_op!(OpRro);
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum MuFuOp {
Cos,
Sin,
Exp2,
Log2,
Rcp,
Rsq,
Rcp64H,
Rsq64H,
Sqrt,
Tanh,
}
impl fmt::Display for MuFuOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MuFuOp::Cos => write!(f, "cos"),
MuFuOp::Sin => write!(f, "sin"),
MuFuOp::Exp2 => write!(f, "exp2"),
MuFuOp::Log2 => write!(f, "log2"),
MuFuOp::Rcp => write!(f, "rcp"),
MuFuOp::Rsq => write!(f, "rsq"),
MuFuOp::Rcp64H => write!(f, "rcp64h"),
MuFuOp::Rsq64H => write!(f, "rsq64h"),
MuFuOp::Sqrt => write!(f, "sqrt"),
MuFuOp::Tanh => write!(f, "tanh"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpMuFu {
#[dst_type(F32)]
pub dst: Dst,
pub op: MuFuOp,
#[src_type(F32)]
pub src: Src,
}
impl DisplayOp for OpMuFu {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "mufu.{} {}", self.op, self.src)
}
}
impl_display_for_op!(OpMuFu);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpDAdd {
#[dst_type(F64)]
pub dst: Dst,
#[src_type(F64)]
pub srcs: [Src; 2],
pub rnd_mode: FRndMode,
}
impl DisplayOp for OpDAdd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "dadd")?;
if self.rnd_mode != FRndMode::NearestEven {
write!(f, "{}", self.rnd_mode)?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1],)
}
}
impl_display_for_op!(OpDAdd);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpDMul {
#[dst_type(F64)]
pub dst: Dst,
#[src_type(F64)]
pub srcs: [Src; 2],
pub rnd_mode: FRndMode,
}
impl DisplayOp for OpDMul {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "dmul")?;
if self.rnd_mode != FRndMode::NearestEven {
write!(f, "{}", self.rnd_mode)?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1],)
}
}
impl_display_for_op!(OpDMul);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpDFma {
#[dst_type(F64)]
pub dst: Dst,
#[src_type(F64)]
pub srcs: [Src; 3],
pub rnd_mode: FRndMode,
}
impl DisplayOp for OpDFma {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "dfma")?;
if self.rnd_mode != FRndMode::NearestEven {
write!(f, "{}", self.rnd_mode)?;
}
write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2])
}
}
impl_display_for_op!(OpDFma);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpDMnMx {
#[dst_type(F64)]
pub dst: Dst,
#[src_type(F64)]
pub srcs: [Src; 2],
#[src_type(Pred)]
pub min: Src,
}
impl DisplayOp for OpDMnMx {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "dmnmx {} {} {}", self.srcs[0], self.srcs[1], self.min)
}
}
impl_display_for_op!(OpDMnMx);
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpDSetP {
#[dst_type(Pred)]
pub dst: Dst,
pub set_op: PredSetOp,
pub cmp_op: FloatCmpOp,
#[src_type(F64)]
pub srcs: [Src; 2],
#[src_type(Pred)]
pub accum: Src,
}
impl Foldable for OpDSetP {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let a = f.get_f64_src(self, &self.srcs[0]);
let b = f.get_f64_src(self, &self.srcs[1]);
let accum = f.get_pred_src(self, &self.accum);
let ordered = !a.is_nan() && !b.is_nan();
let cmp_res = match self.cmp_op {
FloatCmpOp::OrdEq => ordered && a == b,
FloatCmpOp::OrdNe => ordered && a != b,
FloatCmpOp::OrdLt => ordered && a < b,
FloatCmpOp::OrdLe => ordered && a <= b,
FloatCmpOp::OrdGt => ordered && a > b,
FloatCmpOp::OrdGe => ordered && a >= b,
FloatCmpOp::UnordEq => !ordered || a == b,
FloatCmpOp::UnordNe => !ordered || a != b,
FloatCmpOp::UnordLt => !ordered || a < b,
FloatCmpOp::UnordLe => !ordered || a <= b,
FloatCmpOp::UnordGt => !ordered || a > b,
FloatCmpOp::UnordGe => !ordered || a >= b,
FloatCmpOp::IsNum => ordered,
FloatCmpOp::IsNan => !ordered,
};
let res = self.set_op.eval(cmp_res, accum);
f.set_pred_dst(self, &self.dst, res);
}
}
impl DisplayOp for OpDSetP {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "dsetp{}", self.cmp_op)?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, "{}", self.set_op)?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1])?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, " {}", self.accum)?;
}
Ok(())
}
}
impl_display_for_op!(OpDSetP);
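// A small check of the ordered/unordered split used in OpDSetP::fold above:
// any comparison involving NaN is unordered, so the Ord* ops return false
// and the Unord* ops return true.
#[cfg(test)]
mod fcmp_nan_example {
    #[test]
    fn nan_is_unordered() {
        let (a, b) = (f64::NAN, 1.0_f64);
        let ordered = !a.is_nan() && !b.is_nan();
        assert!(!ordered);
        assert!(!(ordered && a == b)); // FloatCmpOp::OrdEq
        assert!(!ordered || a != b); // FloatCmpOp::UnordNe
    }
}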
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpHAdd2 {
#[dst_type(F16v2)]
pub dst: Dst,
#[src_type(F16v2)]
pub srcs: [Src; 2],
pub saturate: bool,
pub ftz: bool,
pub f32: bool,
}
impl DisplayOp for OpHAdd2 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let sat = if self.saturate { ".sat" } else { "" };
let f32 = if self.f32 { ".f32" } else { "" };
write!(f, "hadd2{sat}{f32}")?;
if self.ftz {
write!(f, ".ftz")?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1])
}
}
impl_display_for_op!(OpHAdd2);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpHSet2 {
#[dst_type(F16v2)]
pub dst: Dst,
pub set_op: PredSetOp,
pub cmp_op: FloatCmpOp,
#[src_type(F16v2)]
pub srcs: [Src; 2],
#[src_type(Pred)]
pub accum: Src,
pub ftz: bool,
}
impl DisplayOp for OpHSet2 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let ftz = if self.ftz { ".ftz" } else { "" };
write!(f, "hset2{}{ftz}", self.cmp_op)?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, "{}", self.set_op)?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1])?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, " {}", self.accum)?;
}
Ok(())
}
}
impl_display_for_op!(OpHSet2);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpHSetP2 {
#[dst_type(Pred)]
pub dsts: [Dst; 2],
pub set_op: PredSetOp,
pub cmp_op: FloatCmpOp,
#[src_type(F16v2)]
pub srcs: [Src; 2],
#[src_type(Pred)]
pub accum: Src,
pub ftz: bool,
// When not set, each dst gets the result of its own lane.
// When set, the first dst gets the result of both lanes (res0 && res1)
// and the second dst gets the negation !(res0 && res1),
// before applying the accumulator.
pub horizontal: bool,
}
impl DisplayOp for OpHSetP2 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let ftz = if self.ftz { ".ftz" } else { "" };
write!(f, "hsetp2{}{ftz}", self.cmp_op)?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, "{}", self.set_op)?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1])?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, " {}", self.accum)?;
}
Ok(())
}
}
impl_display_for_op!(OpHSetP2);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpHMul2 {
#[dst_type(F16v2)]
pub dst: Dst,
#[src_type(F16v2)]
pub srcs: [Src; 2],
pub saturate: bool,
pub ftz: bool,
pub dnz: bool,
}
impl DisplayOp for OpHMul2 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let sat = if self.saturate { ".sat" } else { "" };
write!(f, "hmul2{sat}")?;
if self.dnz {
write!(f, ".dnz")?;
} else if self.ftz {
write!(f, ".ftz")?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1])
}
}
impl_display_for_op!(OpHMul2);
#[derive(Clone, Copy, Eq, PartialEq)]
#[allow(dead_code)]
pub enum ImmaSize {
M8N8K16,
M8N8K32,
M16N8K16,
M16N8K32,
M16N8K64,
}
impl fmt::Display for ImmaSize {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ImmaSize::M8N8K16 => write!(f, ".m8n8k16"),
ImmaSize::M8N8K32 => write!(f, ".m8n8k32"),
ImmaSize::M16N8K16 => write!(f, ".m16n8k16"),
ImmaSize::M16N8K32 => write!(f, ".m16n8k32"),
ImmaSize::M16N8K64 => write!(f, ".m16n8k64"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpImma {
#[dst_type(Vec)]
pub dst: Dst,
pub mat_size: ImmaSize,
pub src_types: [IntType; 2],
pub saturate: bool,
#[src_type(SSA)]
pub srcs: [Src; 3],
}
impl DisplayOp for OpImma {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let sat = if self.saturate { ".sat" } else { "" };
write!(
f,
"imma{}{}{}{sat} {} {} {}",
self.mat_size,
self.src_types[0],
self.src_types[1],
self.srcs[0],
self.srcs[1],
self.srcs[2],
)
}
}
impl_display_for_op!(OpImma);
#[derive(Clone, Copy, Eq, PartialEq)]
#[allow(dead_code)]
pub enum HmmaSize {
M16N8K16,
M16N8K8,
M16N8K4,
}
impl fmt::Display for HmmaSize {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
HmmaSize::M16N8K16 => write!(f, ".m16n8k16"),
HmmaSize::M16N8K8 => write!(f, ".m16n8k8"),
HmmaSize::M16N8K4 => write!(f, ".m16n8k4"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpHmma {
#[dst_type(Vec)]
pub dst: Dst,
pub mat_size: HmmaSize,
pub src_type: FloatType,
pub dst_type: FloatType,
#[src_type(SSA)]
pub srcs: [Src; 3],
}
impl DisplayOp for OpHmma {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"hmma{}{} {} {} {}",
self.mat_size,
self.dst_type,
self.srcs[0],
self.srcs[1],
self.srcs[2],
)
}
}
impl_display_for_op!(OpHmma);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpHFma2 {
#[dst_type(F16v2)]
pub dst: Dst,
#[src_type(F16v2)]
pub srcs: [Src; 3],
pub saturate: bool,
pub ftz: bool,
pub dnz: bool,
pub f32: bool,
}
impl DisplayOp for OpHFma2 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let sat = if self.saturate { ".sat" } else { "" };
let f32 = if self.f32 { ".f32" } else { "" };
write!(f, "hfma2{sat}{f32}")?;
if self.dnz {
write!(f, ".dnz")?;
} else if self.ftz {
write!(f, ".ftz")?;
}
write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2])
}
}
impl_display_for_op!(OpHFma2);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpHMnMx2 {
#[dst_type(F16v2)]
pub dst: Dst,
#[src_type(F16v2)]
pub srcs: [Src; 2],
#[src_type(Pred)]
pub min: Src,
pub ftz: bool,
}
impl DisplayOp for OpHMnMx2 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let ftz = if self.ftz { ".ftz" } else { "" };
write!(
f,
"hmnmx2{ftz} {} {} {}",
self.srcs[0], self.srcs[1], self.min
)
}
}
impl_display_for_op!(OpHMnMx2);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBMsk {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub pos: Src,
#[src_type(ALU)]
pub width: Src,
pub wrap: bool,
}
impl DisplayOp for OpBMsk {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let wrap = if self.wrap { ".wrap" } else { ".clamp" };
write!(f, "bmsk{} {} {}", wrap, self.pos, self.width)
}
}
impl_display_for_op!(OpBMsk);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBRev {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub src: Src,
}
impl DisplayOp for OpBRev {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "brev {}", self.src)
}
}
impl_display_for_op!(OpBRev);
/// Bitfield extract. Extracts a range of bits from `base`, described by
/// `range`, into `dst`.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBfe {
/// Where to insert the bits.
#[dst_type(GPR)]
pub dst: Dst,
/// The source of bits to extract.
#[src_type(ALU)]
pub base: Src,
/// The range of bits to extract. This source is interpreted as four
/// separate bytes, [b0, b1, b2, b3].
///
/// b0: the offset of the first bit to extract.
/// b1: the number of bits to extract.
/// b2 and b3: unused.
///
/// This matches the way the hardware works.
#[src_type(ALU)]
pub range: Src,
/// Whether the output is signed
pub signed: bool,
/// Whether to reverse the bits before inserting them into `dst`.
pub reverse: bool,
}
impl DisplayOp for OpBfe {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "bfe")?;
if self.signed {
write!(f, ".s")?;
}
if self.reverse {
write!(f, ".rev")?;
}
write!(f, " {} {}", self.base, self.range,)
}
}
impl_display_for_op!(OpBfe);
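// An illustrative check of the `range` encoding documented above, assuming
// b0 = offset and b1 = bit count: extracting 8 bits at offset 4 from
// 0xabcd1234 yields 0x23.
#[cfg(test)]
mod bfe_range_example {
    #[test]
    fn extract_bits() {
        let (offset, bits) = (4_u32, 8_u32);
        let range = offset | (bits << 8); // [b0, b1, 0, 0]
        let base = 0xabcd_1234_u32;
        let extracted =
            (base >> (range & 0xff)) & ((1 << ((range >> 8) & 0xff)) - 1);
        assert_eq!(extracted, 0x23);
    }
}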
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpFlo {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub src: Src,
pub signed: bool,
pub return_shift_amount: bool,
}
impl Foldable for OpFlo {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let src = f.get_u32_src(self, &self.src);
let leading = if self.signed && (src & 0x80000000) != 0 {
(!src).leading_zeros()
} else {
src.leading_zeros()
};
let dst = if self.return_shift_amount {
leading
} else {
// FLO returns !0 (0xffffffff) when no bit is found (leading == 32).
31_u32.wrapping_sub(leading)
};
f.set_u32_dst(self, &self.dst, dst);
}
}
impl DisplayOp for OpFlo {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "flo")?;
if self.return_shift_amount {
write!(f, ".samt")?;
}
write!(f, " {}", self.src)
}
}
impl_display_for_op!(OpFlo);
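// A quick check of the find-last-one arithmetic in OpFlo::fold: the highest
// set bit of 0x10 is bit 4, and an all-zero source wraps to !0, the
// hardware's "not found" value.
#[cfg(test)]
mod flo_example {
    #[test]
    fn find_last_one() {
        assert_eq!(31 - 0x10_u32.leading_zeros(), 4);
        assert_eq!(31_u32.wrapping_sub(0_u32.leading_zeros()), u32::MAX);
    }
}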
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpIAbs {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub src: Src,
}
impl Foldable for OpIAbs {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let src = f.get_u32_src(self, &self.src);
let dst = (src as i32).unsigned_abs();
f.set_u32_dst(self, &self.dst, dst);
}
}
impl DisplayOp for OpIAbs {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "iabs {}", self.src)
}
}
impl_display_for_op!(OpIAbs);
/// Only used on SM50
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpIAdd2 {
#[dst_type(GPR)]
pub dst: Dst,
#[dst_type(Carry)]
pub carry_out: Dst,
#[src_type(I32)]
pub srcs: [Src; 2],
}
impl Foldable for OpIAdd2 {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let srcs = [
f.get_u32_src(self, &self.srcs[0]),
f.get_u32_src(self, &self.srcs[1]),
];
let mut sum = 0_u64;
for i in 0..2 {
if self.srcs[i].src_mod.is_ineg() {
// This is a very literal interpretation of two's complement.
// This is not -u64::from(src) or u64::from(-src).
sum += u64::from(!srcs[i]) + 1;
} else {
sum += u64::from(srcs[i]);
}
}
f.set_u32_dst(self, &self.dst, sum as u32);
f.set_carry_dst(self, &self.carry_out, sum >= (1 << 32));
}
}
impl DisplayOp for OpIAdd2 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "iadd2 {} {}", self.srcs[0], self.srcs[1])
}
}
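// Illustrates the two's complement note in OpIAdd2::fold above: a negated
// source is added as `!src + 1` in 64 bits, so negating zero produces a
// carry out of bit 31, unlike a naive 32-bit negate.
#[cfg(test)]
mod iadd_ineg_example {
    #[test]
    fn neg_zero_carries() {
        let src = 0_u32;
        let as_hw = u64::from(!src) + 1;
        assert_eq!(as_hw, 1_u64 << 32); // carry out is set
        assert_eq!(src.wrapping_neg(), 0); // a 32-bit negate would not carry
    }
}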
/// Only used on SM50
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpIAdd2X {
#[dst_type(GPR)]
pub dst: Dst,
#[dst_type(Carry)]
pub carry_out: Dst,
#[src_type(B32)]
pub srcs: [Src; 2],
#[src_type(Carry)]
pub carry_in: Src,
}
impl Foldable for OpIAdd2X {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let srcs = [
f.get_u32_bnot_src(self, &self.srcs[0]),
f.get_u32_bnot_src(self, &self.srcs[1]),
];
let carry_in = f.get_carry_src(self, &self.carry_in);
let sum = u64::from(srcs[0]) + u64::from(srcs[1]) + u64::from(carry_in);
f.set_u32_dst(self, &self.dst, sum as u32);
f.set_carry_dst(self, &self.carry_out, sum >= (1 << 32));
}
}
impl DisplayOp for OpIAdd2X {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "iadd2.x {} {}", self.srcs[0], self.srcs[1])?;
if !self.carry_in.is_zero() {
write!(f, " {}", self.carry_in)?;
}
Ok(())
}
}
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpIAdd3 {
#[dst_type(GPR)]
pub dst: Dst,
#[dst_type(Pred)]
pub overflow: [Dst; 2],
#[src_type(I32)]
pub srcs: [Src; 3],
}
impl Foldable for OpIAdd3 {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let srcs = [
f.get_u32_src(self, &self.srcs[0]),
f.get_u32_src(self, &self.srcs[1]),
f.get_u32_src(self, &self.srcs[2]),
];
let mut sum = 0_u64;
for i in 0..3 {
if self.srcs[i].src_mod.is_ineg() {
// This is a very literal interpretation of two's complement.
// This is not -u64::from(src) or u64::from(-src).
sum += u64::from(!srcs[i]) + 1;
} else {
sum += u64::from(srcs[i]);
}
}
f.set_u32_dst(self, &self.dst, sum as u32);
f.set_pred_dst(self, &self.overflow[0], sum >= 1_u64 << 32);
f.set_pred_dst(self, &self.overflow[1], sum >= 2_u64 << 32);
}
}
impl DisplayOp for OpIAdd3 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"iadd3 {} {} {}",
self.srcs[0], self.srcs[1], self.srcs[2],
)
}
}
impl_display_for_op!(OpIAdd3);
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpIAdd3X {
#[dst_type(GPR)]
pub dst: Dst,
#[dst_type(Pred)]
pub overflow: [Dst; 2],
#[src_type(B32)]
pub srcs: [Src; 3],
#[src_type(Pred)]
pub carry: [Src; 2],
}
impl Foldable for OpIAdd3X {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let srcs = [
f.get_u32_bnot_src(self, &self.srcs[0]),
f.get_u32_bnot_src(self, &self.srcs[1]),
f.get_u32_bnot_src(self, &self.srcs[2]),
];
let carry = [
f.get_pred_src(self, &self.carry[0]),
f.get_pred_src(self, &self.carry[1]),
];
let mut sum = 0_u64;
for i in 0..3 {
sum += u64::from(srcs[i]);
}
for i in 0..2 {
sum += u64::from(carry[i]);
}
f.set_u32_dst(self, &self.dst, sum as u32);
f.set_pred_dst(self, &self.overflow[0], sum >= 1_u64 << 32);
f.set_pred_dst(self, &self.overflow[1], sum >= 2_u64 << 32);
}
}
impl DisplayOp for OpIAdd3X {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"iadd3.x {} {} {} {} {}",
self.srcs[0],
self.srcs[1],
self.srcs[2],
self.carry[0],
self.carry[1]
)
}
}
impl_display_for_op!(OpIAdd3X);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpIDp4 {
#[dst_type(GPR)]
pub dst: Dst,
pub src_types: [IntType; 2],
#[src_type(I32)]
pub srcs: [Src; 3],
}
impl DisplayOp for OpIDp4 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"idp4{}{} {} {} {}",
self.src_types[0],
self.src_types[1],
self.srcs[0],
self.srcs[1],
self.srcs[2],
)
}
}
impl_display_for_op!(OpIDp4);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpIMad {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub srcs: [Src; 3],
pub signed: bool,
}
impl DisplayOp for OpIMad {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "imad {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2],)
}
}
impl_display_for_op!(OpIMad);
/// Only used on SM50
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpIMul {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub srcs: [Src; 2],
pub signed: [bool; 2],
pub high: bool,
}
impl DisplayOp for OpIMul {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "imul")?;
if self.high {
write!(f, ".hi")?;
}
let src_type = |signed| if signed { ".s32" } else { ".u32" };
write!(
f,
"{}{}",
src_type(self.signed[0]),
src_type(self.signed[1])
)?;
write!(f, " {} {}", self.srcs[0], self.srcs[1])
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpIMad64 {
#[dst_type(Vec)]
pub dst: Dst,
#[src_type(ALU)]
pub srcs: [Src; 3],
pub signed: bool,
}
impl DisplayOp for OpIMad64 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"imad64 {} {} {}",
self.srcs[0], self.srcs[1], self.srcs[2],
)
}
}
impl_display_for_op!(OpIMad64);
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpIMnMx {
#[dst_type(GPR)]
pub dst: Dst,
pub cmp_type: IntCmpType,
#[src_type(ALU)]
pub srcs: [Src; 2],
#[src_type(Pred)]
pub min: Src,
}
impl Foldable for OpIMnMx {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let (a, b) = (
f.get_u32_bnot_src(self, &self.srcs[0]),
f.get_u32_bnot_src(self, &self.srcs[1]),
);
let min = f.get_pred_src(self, &self.min);
let res = match (min, self.cmp_type) {
(true, IntCmpType::U32) => a.min(b),
(true, IntCmpType::I32) => (a as i32).min(b as i32) as u32,
(false, IntCmpType::U32) => a.max(b),
(false, IntCmpType::I32) => (a as i32).max(b as i32) as u32,
};
f.set_u32_dst(self, &self.dst, res);
}
}
impl DisplayOp for OpIMnMx {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"imnmx{} {} {} {}",
self.cmp_type, self.srcs[0], self.srcs[1], self.min
)
}
}
impl_display_for_op!(OpIMnMx);
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpISetP {
#[dst_type(Pred)]
pub dst: Dst,
pub set_op: PredSetOp,
pub cmp_op: IntCmpOp,
pub cmp_type: IntCmpType,
pub ex: bool,
#[src_type(ALU)]
pub srcs: [Src; 2],
#[src_type(Pred)]
pub accum: Src,
#[src_type(Pred)]
pub low_cmp: Src,
}
impl Foldable for OpISetP {
fn fold(&self, sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let x = f.get_u32_src(self, &self.srcs[0]);
let y = f.get_u32_src(self, &self.srcs[1]);
let accum = f.get_pred_src(self, &self.accum);
let low_cmp = f.get_pred_src(self, &self.low_cmp);
let cmp = if self.cmp_type.is_signed() {
let x = x as i32;
let y = y as i32;
match &self.cmp_op {
IntCmpOp::False => false,
IntCmpOp::True => true,
IntCmpOp::Eq => x == y,
IntCmpOp::Ne => x != y,
IntCmpOp::Lt => x < y,
IntCmpOp::Le => x <= y,
IntCmpOp::Gt => x > y,
IntCmpOp::Ge => x >= y,
}
} else {
match &self.cmp_op {
IntCmpOp::False => false,
IntCmpOp::True => true,
IntCmpOp::Eq => x == y,
IntCmpOp::Ne => x != y,
IntCmpOp::Lt => x < y,
IntCmpOp::Le => x <= y,
IntCmpOp::Gt => x > y,
IntCmpOp::Ge => x >= y,
}
};
let cmp_op_is_const =
matches!(self.cmp_op, IntCmpOp::False | IntCmpOp::True);
let cmp = if self.ex && x == y && !cmp_op_is_const {
// Pre-Volta, isetp.x takes the accumulator into account. If we
// want to support this, we need to take the accumulator into
// account here as well. Disallow it for now.
assert!(sm.sm() >= 70);
low_cmp
} else {
cmp
};
let dst = self.set_op.eval(cmp, accum);
f.set_pred_dst(self, &self.dst, dst);
}
}
impl DisplayOp for OpISetP {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "isetp{}{}", self.cmp_op, self.cmp_type)?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, "{}", self.set_op)?;
}
if self.ex {
write!(f, ".ex")?;
}
write!(f, " {} {}", self.srcs[0], self.srcs[1])?;
if !self.set_op.is_trivial(&self.accum) {
write!(f, " {}", self.accum)?;
}
if self.ex {
write!(f, " {}", self.low_cmp)?;
}
Ok(())
}
}
impl_display_for_op!(OpISetP);
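// A sketch of how isetp.ex chains two 32-bit compares into a 64-bit one,
// matching the fold above: when the high halves compare equal, the result
// is taken from the unsigned low-half comparison (low_cmp).
#[cfg(test)]
mod isetp_ex_example {
    #[test]
    fn chained_64_bit_lt() {
        let (a, b) = (0x1_0000_0005_u64, 0x1_0000_0009_u64);
        let (a_hi, a_lo) = ((a >> 32) as u32, a as u32);
        let (b_hi, b_lo) = ((b >> 32) as u32, b as u32);
        let low_cmp = a_lo < b_lo; // first instruction: low halves, unsigned
        let lt = if a_hi == b_hi { low_cmp } else { a_hi < b_hi };
        assert_eq!(lt, a < b);
    }
}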
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpLea {
#[dst_type(GPR)]
pub dst: Dst,
#[dst_type(Pred)]
pub overflow: Dst,
#[src_type(ALU)]
pub a: Src,
#[src_type(I32)]
pub b: Src,
#[src_type(ALU)]
pub a_high: Src, // High 32-bits of a if .dst_high is set
pub shift: u8,
pub dst_high: bool,
pub intermediate_mod: SrcMod, // Modifier for shifted temporary (a << shift)
}
impl Foldable for OpLea {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let a = f.get_u32_src(self, &self.a);
let mut b = f.get_u32_src(self, &self.b);
let a_high = f.get_u32_src(self, &self.a_high);
let mut overflow = false;
let mut shift_result = if self.dst_high {
let a = a as u64;
let a_high = a_high as u64;
let a = (a_high << 32) | a;
(a >> (32 - self.shift)) as u32
} else {
a << self.shift
};
if self.intermediate_mod.is_ineg() {
let o;
(shift_result, o) = u32::overflowing_add(!shift_result, 1);
overflow |= o;
}
if self.b.src_mod.is_ineg() {
let o;
(b, o) = u32::overflowing_add(!b, 1);
overflow |= o;
}
let (dst, o) = u32::overflowing_add(shift_result, b);
overflow |= o;
f.set_u32_dst(self, &self.dst, dst);
f.set_pred_dst(self, &self.overflow, overflow);
}
}
impl DisplayOp for OpLea {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "lea")?;
if self.dst_high {
write!(f, ".hi")?;
}
write!(f, " {} {} {}", self.a, self.shift, self.b)?;
if self.dst_high {
write!(f, " {}", self.a_high)?;
}
Ok(())
}
}
impl_display_for_op!(OpLea);
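// Mirrors the non-.hi path of OpLea::fold above: dst = (a << shift) + b,
// the classic scaled-index address computation, here for a 4-byte element.
#[cfg(test)]
mod lea_example {
    #[test]
    fn scaled_index() {
        let (base, index, shift) = (0x1000_u32, 5_u32, 2_u32);
        let (dst, overflow) = (index << shift).overflowing_add(base);
        assert_eq!(dst, 0x1014);
        assert!(!overflow);
    }
}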
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpLeaX {
#[dst_type(GPR)]
pub dst: Dst,
#[dst_type(Pred)]
pub overflow: Dst,
#[src_type(ALU)]
pub a: Src,
#[src_type(B32)]
pub b: Src,
#[src_type(ALU)]
pub a_high: Src, // High 32-bits of a if .dst_high is set
#[src_type(Pred)]
pub carry: Src,
pub shift: u8,
pub dst_high: bool,
pub intermediate_mod: SrcMod, // Modifier for shifted temporary (a << shift)
}
impl Foldable for OpLeaX {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let a = f.get_u32_src(self, &self.a);
let mut b = f.get_u32_src(self, &self.b);
let a_high = f.get_u32_src(self, &self.a_high);
let carry = f.get_pred_src(self, &self.carry);
let mut overflow = false;
let mut shift_result = if self.dst_high {
let a = a as u64;
let a_high = a_high as u64;
let a = (a_high << 32) | a;
(a >> (32 - self.shift)) as u32
} else {
a << self.shift
};
if self.intermediate_mod.is_bnot() {
shift_result = !shift_result;
}
if self.b.src_mod.is_bnot() {
b = !b;
}
let (dst, o) = u32::overflowing_add(shift_result, b);
overflow |= o;
let (dst, o) = u32::overflowing_add(dst, if carry { 1 } else { 0 });
overflow |= o;
f.set_u32_dst(self, &self.dst, dst);
f.set_pred_dst(self, &self.overflow, overflow);
}
}
impl DisplayOp for OpLeaX {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "lea.x")?;
if self.dst_high {
write!(f, ".hi")?;
}
write!(f, " {} {} {}", self.a, self.shift, self.b)?;
if self.dst_high {
write!(f, " {}", self.a_high)?;
}
write!(f, " {}", self.carry)?;
Ok(())
}
}
impl_display_for_op!(OpLeaX);
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpLop2 {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(B32)]
pub srcs: [Src; 2],
pub op: LogicOp2,
}
impl DisplayOp for OpLop2 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "lop2.{} {} {}", self.op, self.srcs[0], self.srcs[1],)
}
}
impl Foldable for OpLop2 {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let srcs = [
f.get_u32_bnot_src(self, &self.srcs[0]),
f.get_u32_bnot_src(self, &self.srcs[1]),
];
let dst = match self.op {
LogicOp2::And => srcs[0] & srcs[1],
LogicOp2::Or => srcs[0] | srcs[1],
LogicOp2::Xor => srcs[0] ^ srcs[1],
LogicOp2::PassB => srcs[1],
};
f.set_u32_dst(self, &self.dst, dst);
}
}
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpLop3 {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub srcs: [Src; 3],
pub op: LogicOp3,
}
impl Foldable for OpLop3 {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let srcs = [
f.get_u32_bnot_src(self, &self.srcs[0]),
f.get_u32_bnot_src(self, &self.srcs[1]),
f.get_u32_bnot_src(self, &self.srcs[2]),
];
let dst = self.op.eval(srcs[0], srcs[1], srcs[2]);
f.set_u32_dst(self, &self.dst, dst);
}
}
impl DisplayOp for OpLop3 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"lop3.{} {} {} {}",
self.op, self.srcs[0], self.srcs[1], self.srcs[2],
)
}
}
impl_display_for_op!(OpLop3);
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum ShflOp {
Idx,
Up,
Down,
Bfly,
}
impl fmt::Display for ShflOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ShflOp::Idx => write!(f, "idx"),
ShflOp::Up => write!(f, "up"),
ShflOp::Down => write!(f, "down"),
ShflOp::Bfly => write!(f, "bfly"),
}
}
}
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpShf {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(GPR)]
pub low: Src,
#[src_type(ALU)]
pub high: Src,
#[src_type(ALU)]
pub shift: Src,
pub right: bool,
pub wrap: bool,
pub data_type: IntType,
pub dst_high: bool,
}
fn reduce_shift_imm(shift: &mut Src, wrap: bool, bits: u32) {
debug_assert!(shift.src_mod.is_none());
if let SrcRef::Imm32(shift) = &mut shift.src_ref {
if wrap {
*shift = *shift & (bits - 1);
} else {
*shift = std::cmp::min(*shift, bits)
}
}
}
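// Shows the two out-of-range policies reduce_shift_imm implements for a
// 32-bit shift: wrap keeps the low 5 bits, while clamp saturates at 32,
// which the fold functions below turn into 0 via checked_shl/checked_shr.
#[cfg(test)]
mod shift_imm_example {
    #[test]
    fn wrap_vs_clamp() {
        let shift = 33_u32;
        assert_eq!(shift & (32 - 1), 1); // wrap
        assert_eq!(std::cmp::min(shift, 32), 32); // clamp
        assert_eq!(1_u32.checked_shl(32), None); // clamped shifts fold to 0
    }
}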
impl OpShf {
/// Reduces the shift immediate, if any. Out-of-range shifts are either
/// clamped to the maximum or wrapped as needed.
pub fn reduce_shift_imm(&mut self) {
let bits = self.data_type.bits().try_into().unwrap();
reduce_shift_imm(&mut self.shift, self.wrap, bits);
}
}
impl Foldable for OpShf {
fn fold(&self, sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let low = f.get_u32_src(self, &self.low);
let high = f.get_u32_src(self, &self.high);
let shift = f.get_u32_src(self, &self.shift);
let bits: u32 = self.data_type.bits().try_into().unwrap();
let shift = if self.wrap {
shift & (bits - 1)
} else {
min(shift, bits)
};
let x = u64::from(low) | (u64::from(high) << 32);
let shifted = if sm.sm() < 70
&& self.dst_high
&& self.data_type != IntType::I64
{
if self.right {
x.checked_shr(shift).unwrap_or(0)
} else {
x.checked_shl(shift).unwrap_or(0)
}
} else if self.data_type.is_signed() {
if self.right {
let x = x as i64;
x.checked_shr(shift).unwrap_or(x >> 63) as u64
} else {
x.checked_shl(shift).unwrap_or(0)
}
} else {
if self.right {
x.checked_shr(shift).unwrap_or(0)
} else {
x.checked_shl(shift).unwrap_or(0)
}
};
let dst = if (sm.sm() < 70 && !self.right) || self.dst_high {
(shifted >> 32) as u32
} else {
shifted as u32
};
f.set_u32_dst(self, &self.dst, dst);
}
}
impl DisplayOp for OpShf {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "shf")?;
if self.right {
write!(f, ".r")?;
} else {
write!(f, ".l")?;
}
if self.wrap {
write!(f, ".w")?;
}
write!(f, "{}", self.data_type)?;
if self.dst_high {
write!(f, ".hi")?;
}
write!(f, " {} {} {}", self.low, self.high, self.shift)
}
}
impl_display_for_op!(OpShf);
/// Only used on SM50
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpShl {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(GPR)]
pub src: Src,
#[src_type(ALU)]
pub shift: Src,
pub wrap: bool,
}
impl OpShl {
/// Reduces the shift immediate, if any. Out-of-range shifts are either
/// clamped to the maximum or wrapped as needed.
pub fn reduce_shift_imm(&mut self) {
reduce_shift_imm(&mut self.shift, self.wrap, 32);
}
}
impl DisplayOp for OpShl {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "shl")?;
if self.wrap {
write!(f, ".w")?;
}
write!(f, " {} {}", self.src, self.shift)
}
}
impl Foldable for OpShl {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let x = f.get_u32_src(self, &self.src);
let shift = f.get_u32_src(self, &self.shift);
let shift = if self.wrap {
shift & 31
} else {
min(shift, 32)
};
let dst = x.checked_shl(shift).unwrap_or(0);
f.set_u32_dst(self, &self.dst, dst);
}
}
/// Only used on SM50
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpShr {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(GPR)]
pub src: Src,
#[src_type(ALU)]
pub shift: Src,
pub wrap: bool,
pub signed: bool,
}
impl DisplayOp for OpShr {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "shr")?;
if self.wrap {
write!(f, ".w")?;
}
if !self.signed {
write!(f, ".u32")?;
}
write!(f, " {} {}", self.src, self.shift)
}
}
impl OpShr {
/// Reduces the shift immediate, if any. Out-of-range shifts are either
/// clamped to the maximum or wrapped as needed.
pub fn reduce_shift_imm(&mut self) {
reduce_shift_imm(&mut self.shift, self.wrap, 32);
}
}
impl Foldable for OpShr {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let x = f.get_u32_src(self, &self.src);
let shift = f.get_u32_src(self, &self.shift);
let shift = if self.wrap {
shift & 31
} else {
min(shift, 32)
};
let dst = if self.signed {
let x = x as i32;
x.checked_shr(shift).unwrap_or(x >> 31) as u32
} else {
x.checked_shr(shift).unwrap_or(0)
};
f.set_u32_dst(self, &self.dst, dst);
}
}
#[repr(C)]
pub struct OpF2F {
pub dst: Dst,
pub src: Src,
pub src_type: FloatType,
pub dst_type: FloatType,
pub rnd_mode: FRndMode,
pub ftz: bool,
/// For 16-bit up-conversions, take the high 16 bits of the source register.
/// For 16-bit down-conversions, place the result into the upper 16 bits of
/// the destination register.
pub high: bool,
/// Round to the nearest integer rather than nearest float
///
/// Not available on SM70+
pub integer_rnd: bool,
}
impl AsSlice<Src> for OpF2F {
type Attr = SrcType;
fn as_slice(&self) -> &[Src] {
std::slice::from_ref(&self.src)
}
fn as_mut_slice(&mut self) -> &mut [Src] {
std::slice::from_mut(&mut self.src)
}
fn attrs(&self) -> SrcTypeList {
let src_type = match self.src_type {
FloatType::F16 => SrcType::F16,
FloatType::F32 => SrcType::F32,
FloatType::F64 => SrcType::F64,
};
SrcTypeList::Uniform(src_type)
}
}
impl AsSlice<Dst> for OpF2F {
type Attr = DstType;
fn as_slice(&self) -> &[Dst] {
std::slice::from_ref(&self.dst)
}
fn as_mut_slice(&mut self) -> &mut [Dst] {
std::slice::from_mut(&mut self.dst)
}
fn attrs(&self) -> DstTypeList {
let dst_type = match self.dst_type {
FloatType::F16 => DstType::F16,
FloatType::F32 => DstType::F32,
FloatType::F64 => DstType::F64,
};
DstTypeList::Uniform(dst_type)
}
}
impl DisplayOp for OpF2F {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "f2f")?;
if self.ftz {
write!(f, ".ftz")?;
}
if self.integer_rnd {
write!(f, ".int")?;
}
write!(
f,
"{}{}{} {}",
self.dst_type, self.src_type, self.rnd_mode, self.src,
)
}
}
impl_display_for_op!(OpF2F);
#[repr(C)]
#[derive(DstsAsSlice, SrcsAsSlice)]
pub struct OpF2FP {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub srcs: [Src; 2],
pub rnd_mode: FRndMode,
}
impl DisplayOp for OpF2FP {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "f2fp.pack_ab")?;
if self.rnd_mode != FRndMode::NearestEven {
write!(f, "{}", self.rnd_mode)?;
}
write!(f, " {}, {}", self.srcs[0], self.srcs[1],)
}
}
impl_display_for_op!(OpF2FP);
#[repr(C)]
#[derive(DstsAsSlice)]
pub struct OpF2I {
#[dst_type(GPR)]
pub dst: Dst,
pub src: Src,
pub src_type: FloatType,
pub dst_type: IntType,
pub rnd_mode: FRndMode,
pub ftz: bool,
}
impl AsSlice<Src> for OpF2I {
type Attr = SrcType;
fn as_slice(&self) -> &[Src] {
std::slice::from_ref(&self.src)
}
fn as_mut_slice(&mut self) -> &mut [Src] {
std::slice::from_mut(&mut self.src)
}
fn attrs(&self) -> SrcTypeList {
let src_type = match self.src_type {
FloatType::F16 => SrcType::F16,
FloatType::F32 => SrcType::F32,
FloatType::F64 => SrcType::F64,
};
SrcTypeList::Uniform(src_type)
}
}
impl DisplayOp for OpF2I {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let ftz = if self.ftz { ".ftz" } else { "" };
write!(
f,
"f2i{}{}{}{ftz} {}",
self.dst_type, self.src_type, self.rnd_mode, self.src,
)
}
}
impl_display_for_op!(OpF2I);
#[repr(C)]
pub struct OpI2F {
pub dst: Dst,
pub src: Src,
pub dst_type: FloatType,
pub src_type: IntType,
pub rnd_mode: FRndMode,
}
impl AsSlice<Src> for OpI2F {
type Attr = SrcType;
fn as_slice(&self) -> &[Src] {
std::slice::from_ref(&self.src)
}
fn as_mut_slice(&mut self) -> &mut [Src] {
std::slice::from_mut(&mut self.src)
}
fn attrs(&self) -> SrcTypeList {
if self.src_type.bits() <= 32 {
SrcTypeList::Uniform(SrcType::ALU)
} else {
SrcTypeList::Uniform(SrcType::GPR)
}
}
}
impl AsSlice<Dst> for OpI2F {
type Attr = DstType;
fn as_slice(&self) -> &[Dst] {
std::slice::from_ref(&self.dst)
}
fn as_mut_slice(&mut self) -> &mut [Dst] {
std::slice::from_mut(&mut self.dst)
}
fn attrs(&self) -> DstTypeList {
let dst_type = match self.dst_type {
FloatType::F16 => DstType::F16,
FloatType::F32 => DstType::F32,
FloatType::F64 => DstType::F64,
};
DstTypeList::Uniform(dst_type)
}
}
impl DisplayOp for OpI2F {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"i2f{}{}{} {}",
self.dst_type, self.src_type, self.rnd_mode, self.src,
)
}
}
impl_display_for_op!(OpI2F);
/// Not used on SM70+
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpI2I {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub src: Src,
pub src_type: IntType,
pub dst_type: IntType,
pub saturate: bool,
pub abs: bool,
pub neg: bool,
}
impl DisplayOp for OpI2I {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "i2i")?;
if self.saturate {
write!(f, ".sat ")?;
}
write!(f, "{}{} {}", self.dst_type, self.src_type, self.src,)?;
if self.abs {
write!(f, ".abs")?;
}
if self.neg {
write!(f, ".neg")?;
}
Ok(())
}
}
impl_display_for_op!(OpI2I);
#[repr(C)]
#[derive(DstsAsSlice)]
pub struct OpFRnd {
#[dst_type(F32)]
pub dst: Dst,
pub src: Src,
pub dst_type: FloatType,
pub src_type: FloatType,
pub rnd_mode: FRndMode,
pub ftz: bool,
}
impl AsSlice<Src> for OpFRnd {
type Attr = SrcType;
fn as_slice(&self) -> &[Src] {
std::slice::from_ref(&self.src)
}
fn as_mut_slice(&mut self) -> &mut [Src] {
std::slice::from_mut(&mut self.src)
}
fn attrs(&self) -> SrcTypeList {
let src_type = match self.src_type {
FloatType::F16 => SrcType::F16,
FloatType::F32 => SrcType::F32,
FloatType::F64 => SrcType::F64,
};
SrcTypeList::Uniform(src_type)
}
}
impl DisplayOp for OpFRnd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let ftz = if self.ftz { ".ftz" } else { "" };
write!(
f,
"frnd{}{}{}{ftz} {}",
self.dst_type, self.src_type, self.rnd_mode, self.src,
)
}
}
impl_display_for_op!(OpFRnd);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpMov {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub src: Src,
pub quad_lanes: u8,
}
impl DisplayOp for OpMov {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.quad_lanes == 0xf {
write!(f, "mov {}", self.src)
} else {
write!(f, "mov[{:#x}] {}", self.quad_lanes, self.src)
}
}
}
impl_display_for_op!(OpMov);
#[derive(Copy, Clone)]
pub struct PrmtSelByte(u8);
impl PrmtSelByte {
pub const INVALID: PrmtSelByte = PrmtSelByte(u8::MAX);
pub fn new(src_idx: usize, byte_idx: usize, msb: bool) -> PrmtSelByte {
assert!(src_idx < 2);
assert!(byte_idx < 4);
let mut nib = 0;
nib |= (src_idx as u8) << 2;
nib |= byte_idx as u8;
if msb {
nib |= 0x8;
}
PrmtSelByte(nib)
}
pub fn src(&self) -> usize {
((self.0 >> 2) & 0x1).into()
}
pub fn byte(&self) -> usize {
(self.0 & 0x3).into()
}
pub fn msb(&self) -> bool {
(self.0 & 0x8) != 0
}
pub fn fold_u32(&self, u: u32) -> u8 {
let mut sb = (u >> (self.byte() * 8)) as u8;
if self.msb() {
sb = ((sb as i8) >> 7) as u8;
}
sb
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub struct PrmtSel(pub u16);
impl PrmtSel {
pub fn new(bytes: [PrmtSelByte; 4]) -> PrmtSel {
let mut sel = 0;
for i in 0..4 {
assert!(bytes[i].0 <= 0xf);
sel |= u16::from(bytes[i].0) << (i * 4);
}
PrmtSel(sel)
}
pub fn get(&self, byte_idx: usize) -> PrmtSelByte {
assert!(byte_idx < 4);
PrmtSelByte(((self.0 >> (byte_idx * 4)) & 0xf) as u8)
}
}
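// A round-trip check of the PrmtSelByte nibble encoding above: source index
// in bit 2, byte index in bits 0..2, sign-replicate flag in bit 3.
#[cfg(test)]
mod prmt_sel_byte_example {
    use super::*;

    #[test]
    fn nibble_round_trip() {
        let b = PrmtSelByte::new(1, 2, false);
        assert_eq!(b.0, 0x6);
        assert_eq!((b.src(), b.byte(), b.msb()), (1, 2, false));
        // fold_u32 extracts byte 2 of the selected source.
        assert_eq!(b.fold_u32(0xddcc_bbaa), 0xcc);
    }
}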
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum PrmtMode {
Index,
Forward4Extract,
Backward4Extract,
Replicate8,
EdgeClampLeft,
EdgeClampRight,
Replicate16,
}
impl fmt::Display for PrmtMode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PrmtMode::Index => Ok(()),
PrmtMode::Forward4Extract => write!(f, ".f4e"),
PrmtMode::Backward4Extract => write!(f, ".b4e"),
PrmtMode::Replicate8 => write!(f, ".rc8"),
PrmtMode::EdgeClampLeft => write!(f, ".ecl"),
PrmtMode::EdgeClampRight => write!(f, ".ecl"),
PrmtMode::Replicate16 => write!(f, ".rc16"),
}
}
}
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
/// Permutes `srcs` into `dst` using `selection`.
pub struct OpPrmt {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub srcs: [Src; 2],
#[src_type(ALU)]
pub sel: Src,
pub mode: PrmtMode,
}
impl OpPrmt {
pub fn get_sel(&self) -> Option<PrmtSel> {
// TODO: We could construct a PrmtSel for the other modes but we don't
// use them right now because they're kinda pointless.
if self.mode != PrmtMode::Index {
return None;
}
self.sel.as_u32(SrcType::ALU).map(|sel| {
// The top 16 bits are ignored
PrmtSel(sel as u16)
})
}
/// Reduces the sel immediate, if any.
pub fn reduce_sel_imm(&mut self) {
assert!(self.sel.src_mod.is_none());
if let SrcRef::Imm32(sel) = &mut self.sel.src_ref {
// Only the bottom 16 bits matter anyway
*sel &= 0xffff;
}
}
pub fn as_u32(&self) -> Option<u32> {
let sel = self.get_sel()?;
let mut imm = 0_u32;
for b in 0..4 {
let sel_byte = sel.get(b);
let src_u32 = self.srcs[sel_byte.src()].as_u32(SrcType::ALU)?;
let sb = sel_byte.fold_u32(src_u32);
imm |= u32::from(sb) << (b * 8);
}
Some(imm)
}
}
impl Foldable for OpPrmt {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let srcs = [
f.get_u32_src(self, &self.srcs[0]),
f.get_u32_src(self, &self.srcs[1]),
];
let sel = f.get_u32_src(self, &self.sel);
assert!(self.mode == PrmtMode::Index);
let sel = PrmtSel(sel as u16);
let mut dst = 0_u32;
for b in 0..4 {
let sel_byte = sel.get(b);
let src = srcs[sel_byte.src()];
let sb = sel_byte.fold_u32(src);
dst |= u32::from(sb) << (b * 8);
}
f.set_u32_dst(self, &self.dst, dst);
}
}
impl DisplayOp for OpPrmt {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"prmt{} {} [{}] {}",
self.mode, self.srcs[0], self.sel, self.srcs[1],
)
}
}
impl_display_for_op!(OpPrmt);
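// Mirrors the index-mode byte gather in OpPrmt::fold with concrete values:
// selector 0x5410 interleaves the low halves of both sources.
#[cfg(test)]
mod prmt_fold_example {
    use super::*;

    #[test]
    fn index_mode_gather() {
        let srcs = [0x4433_2211_u32, 0x8877_6655_u32];
        let sel = PrmtSel(0x5410);
        let mut dst = 0_u32;
        for b in 0..4 {
            let sel_byte = sel.get(b);
            let sb = sel_byte.fold_u32(srcs[sel_byte.src()]);
            dst |= u32::from(sb) << (b * 8);
        }
        assert_eq!(dst, 0x6655_2211);
    }
}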
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSel {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(Pred)]
pub cond: Src,
#[src_type(ALU)]
pub srcs: [Src; 2],
}
impl DisplayOp for OpSel {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "sel {} {} {}", self.cond, self.srcs[0], self.srcs[1],)
}
}
impl_display_for_op!(OpSel);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpShfl {
#[dst_type(GPR)]
pub dst: Dst,
#[dst_type(Pred)]
pub in_bounds: Dst,
#[src_type(SSA)]
pub src: Src,
#[src_type(ALU)]
pub lane: Src,
#[src_type(ALU)]
pub c: Src,
pub op: ShflOp,
}
impl OpShfl {
/// Reduces the lane and c immediates, if any. The hardware only uses
/// some of the bits of `lane` and `c` and ignores the rest. This method
/// masks off the unused bits and ensures that any immediate values fit
/// in the limited encoding space in the instruction.
pub fn reduce_lane_c_imm(&mut self) {
debug_assert!(self.lane.src_mod.is_none());
if let SrcRef::Imm32(lane) = &mut self.lane.src_ref {
*lane &= 0x1f;
}
debug_assert!(self.c.src_mod.is_none());
if let SrcRef::Imm32(c) = &mut self.c.src_ref {
*c &= 0x1f1f;
}
}
}
impl DisplayOp for OpShfl {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "shfl.{} {} {} {}", self.op, self.src, self.lane, self.c)
}
}
impl_display_for_op!(OpShfl);
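// Illustrates why reduce_lane_c_imm can mask `lane` to 5 bits: lane indices
// only range over the 32 lanes of a wave, so an immediate of 33 aliases
// lane 1. The butterfly (bfly) mode pairs lanes by XORing the lane index
// (standard shfl semantics, not spelled out in this file).
#[cfg(test)]
mod shfl_lane_example {
    #[test]
    fn lane_masking_and_bfly() {
        assert_eq!(33_u32 & 0x1f, 1);
        let pairs: Vec<u32> = (0..8).map(|lane| lane ^ 4).collect();
        assert_eq!(pairs, vec![4, 5, 6, 7, 0, 1, 2, 3]);
    }
}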
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpPLop3 {
#[dst_type(Pred)]
pub dsts: [Dst; 2],
#[src_type(Pred)]
pub srcs: [Src; 3],
pub ops: [LogicOp3; 2],
}
impl DisplayOp for OpPLop3 {
fn fmt_dsts(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{} {}", self.dsts[0], self.dsts[1])
}
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"plop3 {} {} {} {} {}",
self.srcs[0], self.srcs[1], self.srcs[2], self.ops[0], self.ops[1],
)
}
}
impl_display_for_op!(OpPLop3);
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpPSetP {
#[dst_type(Pred)]
pub dsts: [Dst; 2],
pub ops: [PredSetOp; 2],
#[src_type(Pred)]
pub srcs: [Src; 3],
}
impl Foldable for OpPSetP {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let srcs = [
f.get_pred_src(self, &self.srcs[0]),
f.get_pred_src(self, &self.srcs[1]),
f.get_pred_src(self, &self.srcs[2]),
];
let tmp = self.ops[0].eval(srcs[0], srcs[1]);
let dst0 = self.ops[1].eval(srcs[2], tmp);
let tmp = self.ops[0].eval(!srcs[0], srcs[1]);
let dst1 = self.ops[1].eval(srcs[2], tmp);
f.set_pred_dst(self, &self.dsts[0], dst0);
f.set_pred_dst(self, &self.dsts[1], dst1);
}
}
impl DisplayOp for OpPSetP {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"psetp{}{} {} {} {}",
self.ops[0], self.ops[1], self.srcs[0], self.srcs[1], self.srcs[2],
)
}
}
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpPopC {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(B32)]
pub src: Src,
}
impl Foldable for OpPopC {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let src = f.get_u32_bnot_src(self, &self.src);
let dst = src.count_ones();
f.set_u32_dst(self, &self.dst, dst);
}
}
impl DisplayOp for OpPopC {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "popc {}", self.src,)
}
}
impl_display_for_op!(OpPopC);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpR2UR {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(GPR)]
pub src: Src,
}
impl DisplayOp for OpR2UR {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "r2ur {}", self.src)
}
}
impl_display_for_op!(OpR2UR);
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum ReduxOp {
And,
Or,
Xor,
Sum,
Min(IntCmpType),
Max(IntCmpType),
}
impl fmt::Display for ReduxOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ReduxOp::And => write!(f, ".and"),
ReduxOp::Or => write!(f, ".or"),
ReduxOp::Xor => write!(f, ".xor"),
ReduxOp::Sum => write!(f, ".sum"),
ReduxOp::Min(cmp) => write!(f, ".min{cmp}"),
ReduxOp::Max(cmp) => write!(f, ".max{cmp}"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpRedux {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(GPR)]
pub src: Src,
pub op: ReduxOp,
}
impl DisplayOp for OpRedux {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "redux{} {}", self.op, self.src)
}
}
impl_display_for_op!(OpRedux);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpTex {
pub dsts: [Dst; 2],
pub fault: Dst,
pub tex: TexRef,
#[src_type(SSA)]
pub srcs: [Src; 2],
pub dim: TexDim,
pub lod_mode: TexLodMode,
pub deriv_mode: TexDerivMode,
pub z_cmpr: bool,
pub offset_mode: TexOffsetMode,
pub mem_eviction_priority: MemEvictionPriority,
pub nodep: bool,
pub channel_mask: ChannelMask,
}
impl DisplayOp for OpTex {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"tex{}{}{}{}",
self.dim, self.lod_mode, self.offset_mode, self.deriv_mode
)?;
if self.z_cmpr {
write!(f, ".dc")?;
}
write!(f, "{}", self.mem_eviction_priority)?;
if self.nodep {
write!(f, ".nodep")?;
}
write!(f, "{}", self.channel_mask)?;
write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1])
}
}
impl_display_for_op!(OpTex);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpTld {
pub dsts: [Dst; 2],
pub fault: Dst,
pub tex: TexRef,
#[src_type(SSA)]
pub srcs: [Src; 2],
pub dim: TexDim,
pub is_ms: bool,
pub lod_mode: TexLodMode,
pub offset_mode: TexOffsetMode,
pub mem_eviction_priority: MemEvictionPriority,
pub nodep: bool,
pub channel_mask: ChannelMask,
}
impl DisplayOp for OpTld {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "tld{}{}{}", self.dim, self.lod_mode, self.offset_mode)?;
if self.is_ms {
write!(f, ".ms")?;
}
write!(f, "{}", self.mem_eviction_priority)?;
if self.nodep {
write!(f, ".nodep")?;
}
write!(f, "{}", self.channel_mask)?;
write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1])
}
}
impl_display_for_op!(OpTld);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpTld4 {
pub dsts: [Dst; 2],
pub fault: Dst,
pub tex: TexRef,
#[src_type(SSA)]
pub srcs: [Src; 2],
pub dim: TexDim,
pub comp: u8,
pub offset_mode: TexOffsetMode,
pub z_cmpr: bool,
pub mem_eviction_priority: MemEvictionPriority,
pub nodep: bool,
pub channel_mask: ChannelMask,
}
impl DisplayOp for OpTld4 {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "tld4.g{}{}", self.dim, self.offset_mode)?;
if self.z_cmpr {
write!(f, ".dc")?;
}
write!(f, "{}", self.mem_eviction_priority)?;
if self.nodep {
write!(f, ".nodep")?;
}
write!(f, "{}", self.channel_mask)?;
write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1])
}
}
impl_display_for_op!(OpTld4);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpTmml {
pub dsts: [Dst; 2],
pub tex: TexRef,
#[src_type(SSA)]
pub srcs: [Src; 2],
pub dim: TexDim,
pub deriv_mode: TexDerivMode,
pub nodep: bool,
pub channel_mask: ChannelMask,
}
impl DisplayOp for OpTmml {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "tmml.lod{}{}", self.dim, self.deriv_mode)?;
if self.nodep {
write!(f, ".nodep")?;
}
write!(f, "{}", self.channel_mask)?;
write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1])
}
}
impl_display_for_op!(OpTmml);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpTxd {
pub dsts: [Dst; 2],
pub fault: Dst,
pub tex: TexRef,
#[src_type(SSA)]
pub srcs: [Src; 2],
pub dim: TexDim,
pub offset_mode: TexOffsetMode,
pub mem_eviction_priority: MemEvictionPriority,
pub nodep: bool,
pub channel_mask: ChannelMask,
}
impl DisplayOp for OpTxd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"txd{}{}{}",
self.dim, self.offset_mode, self.mem_eviction_priority
)?;
if self.nodep {
write!(f, ".nodep")?;
}
write!(f, "{}", self.channel_mask)?;
write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1])
}
}
impl_display_for_op!(OpTxd);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpTxq {
pub dsts: [Dst; 2],
pub tex: TexRef,
#[src_type(SSA)]
pub src: Src,
pub query: TexQuery,
pub nodep: bool,
pub channel_mask: ChannelMask,
}
impl DisplayOp for OpTxq {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "txq")?;
if self.nodep {
write!(f, ".nodep")?;
}
write!(f, "{}", self.channel_mask)?;
write!(f, " {} {} {}", self.tex, self.src, self.query)
}
}
impl_display_for_op!(OpTxq);
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum ImageAccess {
Binary(MemType),
Formatted(ChannelMask),
}
impl fmt::Display for ImageAccess {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ImageAccess::Binary(mem_type) => write!(f, ".b{mem_type}"),
ImageAccess::Formatted(mask) => write!(f, ".p{mask}"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuLd {
pub dst: Dst,
pub fault: Dst,
pub image_access: ImageAccess,
pub image_dim: ImageDim,
pub mem_order: MemOrder,
pub mem_eviction_priority: MemEvictionPriority,
#[src_type(SSA)]
pub handle: Src,
#[src_type(SSA)]
pub coord: Src,
}
impl DisplayOp for OpSuLd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"suld{}{}{}{} [{}] {}",
self.image_access,
self.image_dim,
self.mem_order,
self.mem_eviction_priority,
self.coord,
self.handle,
)
}
}
impl_display_for_op!(OpSuLd);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuSt {
pub image_access: ImageAccess,
pub image_dim: ImageDim,
pub mem_order: MemOrder,
pub mem_eviction_priority: MemEvictionPriority,
#[src_type(SSA)]
pub handle: Src,
#[src_type(SSA)]
pub coord: Src,
#[src_type(SSA)]
pub data: Src,
}
impl DisplayOp for OpSuSt {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"sust{}{}{}{} [{}] {} {}",
self.image_access,
self.image_dim,
self.mem_order,
self.mem_eviction_priority,
self.coord,
self.data,
self.handle,
)
}
}
impl_display_for_op!(OpSuSt);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuAtom {
pub dst: Dst,
pub fault: Dst,
pub image_dim: ImageDim,
pub atom_op: AtomOp,
pub atom_type: AtomType,
pub mem_order: MemOrder,
pub mem_eviction_priority: MemEvictionPriority,
#[src_type(SSA)]
pub handle: Src,
#[src_type(SSA)]
pub coord: Src,
#[src_type(SSA)]
pub data: Src,
}
impl DisplayOp for OpSuAtom {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"suatom.p{}{}{}{}{} [{}] {} {}",
self.image_dim,
self.atom_op,
self.atom_type,
self.mem_order,
self.mem_eviction_priority,
self.coord,
self.data,
self.handle,
)
}
}
impl_display_for_op!(OpSuAtom);
#[derive(Clone, Copy)]
pub enum SuClampMode {
StoredInDescriptor,
PitchLinear,
BlockLinear,
}
impl fmt::Display for SuClampMode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = match self {
SuClampMode::StoredInDescriptor => ".sd",
SuClampMode::PitchLinear => ".pl",
SuClampMode::BlockLinear => ".bl",
};
write!(f, "{}", s)
}
}
#[derive(Clone, Copy)]
pub enum SuClampRound {
R1,
R2,
R4,
R8,
R16,
}
impl SuClampRound {
pub fn to_int(&self) -> u8 {
match self {
SuClampRound::R1 => 1,
SuClampRound::R2 => 2,
SuClampRound::R4 => 4,
SuClampRound::R8 => 8,
SuClampRound::R16 => 16,
}
}
#[allow(dead_code)]
pub fn to_mask(&self) -> u32 {
!(self.to_int() as u32 - 1)
}
}
impl fmt::Display for SuClampRound {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, ".r{}", self.to_int())
}
}
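// Works through the doc-comment example below: with .r4 rounding, an
// inclusive bound of 15 becomes 12, so a 4-byte access at the clamped
// address stays inside a 16-byte buffer.
#[cfg(test)]
mod suclamp_round_example {
    use super::*;

    #[test]
    fn round_bound() {
        assert_eq!(SuClampRound::R4.to_mask(), !3_u32);
        assert_eq!(15_u32 & SuClampRound::R4.to_mask(), 12);
    }
}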
/// Kepler only
/// Surface Clamp
///
/// Clamps coordinates of surface operations to an inclusive 0..=clamp
/// range. It also computes other information useful for computing the
/// real address of an element within an image, for both block-linear and
/// pitch-linear layouts. We can also reduce this operation to a "stupid"
/// inclusive clamp by setting modifier Mode=PitchLinear and is_2d=false;
/// this computes no extra information and is useful for clamping array
/// indexes.
///
/// Since the shader code does not know whether an image layout is
/// block-linear or pitch-linear, this opcode must be able to do both; the
/// operation is then selected by the "clamp" bitfield, usually read from a
/// descriptor. In block-linear mode, the bits are split between those that
/// compute the higher part and those that compute the lower part.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuClamp {
#[dst_type(GPR)]
pub dst: Dst,
#[dst_type(Pred)]
pub out_of_bounds: Dst,
/// This modifier specifies whether we use pitch-linear or block-linear
/// calculations; another option is to support both and read the actual
/// layout from the clamp bitfield (shader code doesn't always know an
/// image's layout).
/// When mode=pitch_linear and is_2d=false the suclamp op enters a
/// simpler "plain" mode where it only performs clamping and the output
/// register doesn't contain any information bits about pitch-linear or
/// block-linear calculations.
pub mode: SuClampMode,
/// Strangely enough, "round" just rounds the clamp bound, not the source,
/// so it does not help at all with clamping coordinates.
/// It could be useful when clamping raw addresses of a multi-byte read.
/// ex: if we read 4 bytes at once, and the buffer length is 16,
/// the bound will be 15 (bounds are inclusive), but if we read
/// at address 15 we would read bytes 15..19, so we are out of range.
/// If we round the bound to R4, the effective bound becomes 12,
/// so the read will be performed from 12..16, remaining in bounds.
pub round: SuClampRound,
pub is_s32: bool,
pub is_2d: bool,
#[src_type(GPR)]
pub coords: Src,
/// Packed parameter containing both the bound (inclusive)
/// and other information (explained in more detail in the Foldable impl):
/// 0..20:  bound (inclusive)
/// 21:     pitch_linear (used if mode == StoredInDescriptor)
/// 22..26: coord shr
/// 26..29: coord shl (el_size_log2)
/// 29..32: number of tiles
#[src_type(ALU)]
pub params: Src,
/// Added to the coords; it's only a 6-bit signed immediate (i6)
pub imm: i8,
}
impl Foldable for OpSuClamp {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let src = f.get_u32_src(self, &self.coords);
let params = f.get_u32_src(self, &self.params);
let imm = self.imm; // i6
let src = if self.is_s32 {
(src as i32) as i64
} else {
src as i64
};
let src = src + (imm as i64);
let params_bv = BitView::new(&params);
let pitch_linear = match self.mode {
SuClampMode::StoredInDescriptor => params_bv.get_bit(21),
SuClampMode::PitchLinear => true,
SuClampMode::BlockLinear => false,
};
let bounds = if pitch_linear && !self.is_2d {
params
} else {
params_bv.get_bit_range_u64(0..20) as u32
};
let bounds = bounds & self.round.to_mask();
let (is_oob, clamped) = if src < 0 {
(true, 0)
} else if src > (bounds as i64) {
(true, bounds)
} else {
(false, src as u32)
};
let mut out = 0u32;
let mut bv = BitMutView::new(&mut out);
if pitch_linear {
if !self.is_2d {
// simple clamp mode, NO BITFIELD
bv.set_field(0..32, clamped);
} else {
// Real, pitch_linear mode
bv.set_field(0..20, clamped & 0xfffff);
// Pass through el_size_log2
bv.set_field(27..30, params_bv.get_bit_range_u64(26..29));
bv.set_bit(30, true); // pitch_linear=true
bv.set_bit(31, is_oob);
}
} else {
// Block linear
// Number of bits to discard for GoB coordinates
let shr_a = params_bv.get_bit_range_u64(22..26) as u8;
// Block coords
bv.set_field(0..16, (clamped >> shr_a) & 0xffff);
// Shift applied to coords, always zero except for x.
// (for coord x=1 and format R32, we want to access byte 4)
// e.g. R8 -> 0, R32 -> 2, 128 -> 4
let el_size_log2 = params_bv.get_bit_range_u64(26..29) as u8;
// Coord inside GoB (element space)
bv.set_field(16..24, (clamped << el_size_log2) & 0xff);
// Useful later to compute gob-space coords.
let n_tiles = params_bv.get_bit_range_u64(29..32) as u8;
bv.set_field(27..30, n_tiles);
bv.set_bit(30, false); // pitch_linear=false
bv.set_bit(31, is_oob);
}
f.set_u32_dst(self, &self.dst, out);
f.set_pred_dst(self, &self.out_of_bounds, is_oob);
}
}
impl DisplayOp for OpSuClamp {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "suclamp{}", self.mode)?;
if !matches!(self.round, SuClampRound::R1) {
write!(f, "{}", self.round)?;
}
if !self.is_s32 {
write!(f, ".u32")?;
}
if !self.is_2d {
write!(f, ".1d")?;
}
write! {f, " {} {} {:x}", self.coords, self.params, self.imm}
}
}
impl_display_for_op!(OpSuClamp);
/// Kepler only
/// BitField Merge
///
/// The resulting bit-field is composed of a high part 8..32 that is merged
/// with the address by sueau, and a low part 0..8 that is provided
/// directly to suldga/sustga and defines the lower offset of the global
/// array.
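/// As an illustrative sketch (operand names and order are schematic;
/// the instruction names are the ops defined in this file), the Kepler
/// surface-address chain fits together roughly like this:
///
/// ```text
/// suclamp x, params_x   -> clamped x, annotated with layout info
/// suclamp y, params_y   -> clamped y, annotated with layout info
/// subfm   x, y, z       -> merged bit-field (low 8 bits + high part)
/// sueau   off, bf, addr -> high 32 bits of the element address
/// suldga  [hi, lo] ...  -> load through the two-part address
/// ```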
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuBfm {
#[dst_type(GPR)]
pub dst: Dst,
#[dst_type(Pred)]
pub pdst: Dst,
/// x, y, z
#[src_type(ALU)]
pub srcs: [Src; 3],
/// When is_3d=false the third source's tile bits are ignored, but the
/// source is still used in the pitch-linear computation.
pub is_3d: bool,
}
impl Foldable for OpSuBfm {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let x_raw = f.get_u32_src(self, &self.srcs[0]);
let y_raw = f.get_u32_src(self, &self.srcs[1]);
let z_raw = f.get_u32_src(self, &self.srcs[2]);
let x = BitView::new(&x_raw);
let y = BitView::new(&y_raw);
let z = BitView::new(&z_raw);
let mut o_raw = 0u32;
let mut o = BitMutView::new(&mut o_raw);
let is_pitch_linear_2d = x.get_bit(30) || y.get_bit(30);
if !is_pitch_linear_2d {
// Copy coordinates inside of GoB space.
// They are 6 bits from x and 3 from y (GoB is 64x8 bytes).
// Bits from 0..8 are ignored by sueau and are used directly
// by suldga/sustga.
// Bit 9 will become the first bit of the higher part in
// sueau.
o.set_bit_range_u64(0..4, x.get_bit_range_u64(16..20));
// Address calculation inside of GoB should virtually be
// y * 64 + x * element_size (each row is linear).
// So why are those bits swizzled like so?
// I have no idea, but these are correct even for atomics
// that accept real addresses.
o.set_bit(4, y.get_bit(16));
o.set_bit(5, y.get_bit(17));
o.set_bit(6, x.get_bit(20));
o.set_bit(7, y.get_bit(18));
o.set_bit(8, x.get_bit(21));
// 9..11: 0
// -------------- Tiles --------------
// Number of tiles log2
let ntx = x.get_bit_range_u64(27..30) & 0x1;
let nty = y.get_bit_range_u64(27..30);
let ntz = z.get_bit_range_u64(27..30);
let ntz = ntz * (self.is_3d as u64); // z is ignored if is_3d=false
// Computes how many bits to dedicate to GoB coords inside
// a block
o.set_field(12..16, ntx + nty + ntz);
// Coords in gob_space.
// Remove 6 bits from x and 3 bits from y, those are used
// as element coords in GoB space.
let a = x.get_bit_range_u64(22..24); // 1100_0000
let b = y.get_bit_range_u64(19..24); // 1111_1000
let c = z.get_bit_range_u64(16..24); // 1111_1111
// nt* indicates how many bits to consider (max 5)
let a = a & ((1 << ntx) - 1);
let b = b & ((1 << nty.min(5)) - 1);
let c = c & ((1 << ntz.min(5)) - 1);
// Compute gob offset
// We can just or together at certain offsets because
// Tiles are always powers of two in each direction.
// z || y || x (LSB)
let res = c;
let res = (res << nty) | b;
let res = (res << ntx) | a;
let mask = match ntx {
0 => 0x3ff,
_ => 0x7ff,
};
// gob coords will be put before the block coords in
// sueau.
o.set_field(16..27, res & mask);
} else {
let d = z.get_bit_range_u64(0..8);
let el_size_log2 = x.get_bit_range_u64(27..30);
o.set_field(0..8, (d << el_size_log2) & 0xff);
// 9..11: 0
o.set_field(12..15, el_size_log2);
}
o.set_bit(11, is_pitch_linear_2d);
let is_oob =
x.get_bit(31) || y.get_bit(31) || (z.get_bit(31) && self.is_3d);
f.set_u32_dst(self, &self.dst, o_raw);
f.set_pred_dst(self, &self.pdst, is_oob);
}
}
impl DisplayOp for OpSuBfm {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "subfm")?;
if self.is_3d {
write!(f, ".3d")?;
}
write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2])
}
}
impl_display_for_op!(OpSuBfm);
/// Kepler only
/// Used to compute the high 32 bits of the image address using
/// the merged bit-field and the block coordinates (offset).
/// It can switch to a pitch-linear mode (bit 11 of the bit-field).
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuEau {
#[dst_type(GPR)]
pub dst: Dst,
/// offset is computed from the block coordinates.
/// it's ok to add it directly to the address since they are both
/// "aligned" to 64 (the first 8 bits are removed from both)
#[src_type(GPR)]
pub off: Src,
/// 8..9:   offset, last bit
/// 11..12: pitch_linear: when enabled the bf-offset is ignored and
///         off_shl is reduced by 8
/// 12..16: off_shl, shifts the offset left by off_shl + 1
/// 16..27: 11-bit offset; joined with the 1-bit offset it completes the
///         12-bit offset ORed into the shifted src offset
///         (unless pitch_linear)
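/// In block-linear mode this amounts to the following (a sketch of the
/// Foldable impl below; `offset_11bit` and `offset_1bit` are descriptive
/// names for the 16..27 and 8..9 pieces):
///
/// ```text
/// real_off = (off << (off_shl + 1)) | (offset_11bit << 1) | offset_1bit
/// dst      = addr + (real_off & 0x7ff_ffff)
/// ```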
#[src_type(ALU)]
pub bit_field: Src,
#[src_type(GPR)]
pub addr: Src,
}
impl Foldable for OpSuEau {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let off_raw = f.get_u32_src(self, &self.off);
let bf_raw = f.get_u32_src(self, &self.bit_field);
let addr = f.get_u32_src(self, &self.addr);
let bf = BitView::new(&bf_raw);
let off1 = bf.get_bit_range_u64(8..9) as u32;
let is_pitch_linear = bf.get_bit(11);
let off_shift = bf.get_bit_range_u64(12..16) as u32;
let offs = bf.get_bit_range_u64(16..27) as u32;
let res = if !is_pitch_linear {
// Block linear
// off_raw are the block coordinates
// to those we add gob coordinates from the merged bitfield
// and the MSB of in-gob coordinates.
let omul = off_shift + 1;
let real_off = (off_raw << omul) | (offs << 1) | off1;
addr.wrapping_add(real_off & 0x7ff_ffff)
} else {
// Add the high part of the coordinates to addr
// off << (omul - 8)
// but for negative values do a shr instead.
// In fact, off_shift will always be < 8 because pitch_linear
// subfm only assigns bits 12..15, so this is always a shr
let shl_amount = off_shift as i32 - 8;
let off = if shl_amount < 0 {
off_raw >> (-shl_amount as u32)
} else {
off_raw << (shl_amount as u32)
};
addr.wrapping_add(off & 0xff_ffff)
};
f.set_u32_dst(self, &self.dst, res);
}
}
impl DisplayOp for OpSuEau {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write! {f, "sueau {} {} {}", self.off, self.bit_field, self.addr}
}
}
impl_display_for_op!(OpSuEau);
#[derive(Copy, Clone, Debug)]
pub enum IMadSpSrcType {
U32,
U24,
U16Hi,
U16Lo,
S32,
S24,
S16Hi,
S16Lo,
}
impl IMadSpSrcType {
pub fn unsigned(self) -> IMadSpSrcType {
use IMadSpSrcType::*;
match self {
S32 => U32,
S24 => U24,
S16Hi => U16Hi,
S16Lo => U16Lo,
x => x,
}
}
#[allow(dead_code)] // Used in hw_tests
pub fn with_sign(self, sign: bool) -> Self {
use IMadSpSrcType::*;
if !sign {
return self.unsigned();
}
match self {
U32 => S32,
U24 => S24,
U16Hi => S16Hi,
U16Lo => S16Lo,
x => x,
}
}
pub fn sign(self) -> bool {
use IMadSpSrcType::*;
match self {
U32 | U24 | U16Hi | U16Lo => false,
S32 | S24 | S16Hi | S16Lo => true,
}
}
#[allow(dead_code)]
fn cast(&self, v: u32) -> i64 {
use IMadSpSrcType::*;
match self {
U32 => v as i64,
U24 => (v & 0x00ff_ffff) as i64,
U16Lo => (v as u16) as i64,
U16Hi => (v >> 16) as i64,
S32 => (v as i32) as i64,
S24 => (((v as i32) << 8) >> 8) as i64, // Sign extend
S16Lo => (v as i16) as i64,
S16Hi => ((v >> 16) as i16) as i64,
}
}
}
impl fmt::Display for IMadSpSrcType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let sign = if self.sign() { ".s" } else { ".u" };
let width = match self.unsigned() {
IMadSpSrcType::U32 => "32",
IMadSpSrcType::U24 => "24",
IMadSpSrcType::U16Lo => "16h0",
IMadSpSrcType::U16Hi => "16h1",
_ => unreachable!(),
};
write!(f, "{}{}", sign, width)
}
}
#[derive(Clone, Copy, Debug)]
pub enum IMadSpMode {
Explicit([IMadSpSrcType; 3]),
// Parameters are loaded from src1 bits 26..32
FromSrc1,
}
impl fmt::Display for IMadSpMode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
IMadSpMode::Explicit([a, b, c]) => write!(f, "{a}{b}{c}"),
IMadSpMode::FromSrc1 => write!(f, ".sd"),
}
}
}
/// Kepler only
/// Extracted Integer Multiply and Add.
/// It does the same operation as an imad op, but it can extract the
/// sources from a subset of the register (only 32, 24 or 16 bits).
/// It also has a "load parameters" mode where the modifiers are
/// loaded from the higher bits of src1 (check the Foldable impl for
/// details).
/// Limits: src1 can never be U32 or U16Hi,
/// src2 can never be U16Hi,
/// src2 signedness is tied to src0 and src1 signedness:
/// if either is signed, src2 must be signed too.
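/// As a sketch, with mode = Explicit([S16Hi, U24, S32]) the folded result
/// is:
///
/// ```text
/// src0' = sign-extend of src0's high 16 bits  (.s16h1)
/// src1' = src1 & 0xff_ffff                    (.u24)
/// src2' = src2 as a signed 32-bit value       (.s32)
/// dst   = low 32 bits of (src0' * src1' + src2')
/// ```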
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpIMadSp {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(ALU)]
pub srcs: [Src; 3],
pub mode: IMadSpMode,
}
impl Foldable for OpIMadSp {
fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
let src0 = f.get_u32_src(self, &self.srcs[0]);
let src1 = f.get_u32_src(self, &self.srcs[1]);
let src2 = f.get_u32_src(self, &self.srcs[2]);
let (src_type0, src_type1, src_type2) = match self.mode {
IMadSpMode::Explicit([t0, t1, t2]) => (t0, t1, t2),
IMadSpMode::FromSrc1 => {
let params = BitView::new(&src1);
let st2 = params.get_bit_range_u64(26..28) as usize;
let st1 = params.get_bit_range_u64(28..30) as usize;
let st0 = params.get_bit_range_u64(30..32) as usize;
use IMadSpSrcType::*;
let types0 = [U32, U24, U16Lo, U16Hi];
let types1 = [U16Lo, U24, U16Lo, U24];
let types2 = [U32, U24, U16Lo, U32];
(
types0[st0].unsigned(),
types1[st1].unsigned(),
types2[st2].unsigned(),
)
}
};
let src0 = src_type0.cast(src0);
let src1 = src_type1.cast(src1);
let src2 = src_type2.cast(src2);
f.set_u32_dst(self, &self.dst, (src0 * src1 + src2) as u32);
}
}
impl DisplayOp for OpIMadSp {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"imadsp{} {} {} {}",
self.mode, self.srcs[0], self.srcs[1], self.srcs[2]
)
}
}
impl_display_for_op!(OpIMadSp);
/// In SuGa ops, the address is always specified in two parts, the higher
/// part contains the base address without the lower 8 bits (base_addr >> 8),
/// while the lower part might contain either the missing 8 bits (U8) or
/// a full 32-bit offset that must not be shifted (U32).
///
/// In short:
/// U8 : real_address = (addr_hi << 8) + (addr_lo & 0xFF)
/// U32: real_address = (addr_hi << 8) + addr_lo
/// The signed variants presumably do the same but with sign extension.
#[derive(Clone, Copy)]
pub enum SuGaOffsetMode {
U32,
S32,
U8,
S8,
}
/// Kepler only
/// Load a pixel from an image; takes the pixel address and format as
/// arguments. Since the image coordinates are not present, the instruction
/// also needs an `out_of_bounds` predicate; when true, it always loads
/// (0, 0, 0, 1).
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuLdGa {
pub dst: Dst,
pub mem_type: MemType,
pub offset_mode: SuGaOffsetMode,
pub cache_op: LdCacheOp,
/// Format for the loaded data, passed directly from the descriptor.
#[src_type(GPR)]
pub format: Src,
/// This is not a single address but two registers that contain
/// [addr >> 8, addr & 0xff].
/// This works because addr >> 8 fits in 32 bits (GOB-aligned) and the
/// remaining 8 bits are extracted from the bit-field.
/// The split is useful since in block-linear mode the lower bits and the
/// higher bits are computed in different ways.
#[src_type(SSA)]
pub addr: Src,
#[src_type(Pred)]
pub out_of_bounds: Src,
}
impl DisplayOp for OpSuLdGa {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"suldga{}{} [{}] {} {}",
self.mem_type,
self.cache_op,
self.addr,
self.format,
self.out_of_bounds
)
}
}
impl_display_for_op!(OpSuLdGa);
/// Kepler only
/// Store a pixel in an image; takes the pixel address and format as
/// arguments. Since the image coordinates are not present, the instruction
/// also needs an `out_of_bounds` predicate; when true, stores are ignored.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuStGa {
pub image_access: ImageAccess,
pub offset_mode: SuGaOffsetMode,
pub cache_op: StCacheOp,
#[src_type(GPR)]
pub format: Src,
#[src_type(SSA)]
pub addr: Src,
#[src_type(SSA)]
pub data: Src,
#[src_type(Pred)]
pub out_of_bounds: Src,
}
impl DisplayOp for OpSuStGa {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"sustga{}{} [{}] {} {} {}",
self.image_access,
self.cache_op,
self.addr,
self.format,
self.data,
self.out_of_bounds,
)
}
}
impl_display_for_op!(OpSuStGa);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpLd {
pub dst: Dst,
#[src_type(GPR)]
pub addr: Src,
pub offset: i32,
pub access: MemAccess,
}
impl DisplayOp for OpLd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "ld{} [{}", self.access, self.addr)?;
if self.offset > 0 {
write!(f, "+{:#x}", self.offset)?;
}
write!(f, "]")
}
}
impl_display_for_op!(OpLd);
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum LdcMode {
Indexed,
IndexedLinear,
IndexedSegmented,
IndexedSegmentedLinear,
}
impl fmt::Display for LdcMode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
LdcMode::Indexed => Ok(()),
LdcMode::IndexedLinear => write!(f, ".il"),
LdcMode::IndexedSegmented => write!(f, ".is"),
LdcMode::IndexedSegmentedLinear => write!(f, ".isl"),
}
}
}
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpLdc {
pub dst: Dst,
#[src_type(ALU)]
pub cb: Src,
#[src_type(GPR)]
pub offset: Src,
pub mode: LdcMode,
pub mem_type: MemType,
}
impl DisplayOp for OpLdc {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let SrcRef::CBuf(cb) = &self.cb.src_ref else {
panic!("Not a cbuf");
};
write!(f, "ldc{}{} {}[", self.mode, self.mem_type, cb.buf)?;
if self.offset.is_zero() {
write!(f, "+{:#x}", cb.offset)?;
} else if cb.offset == 0 {
write!(f, "{}", self.offset)?;
} else {
write!(f, "{}+{:#x}", self.offset, cb.offset)?;
}
write!(f, "]")
}
}
impl_display_for_op!(OpLdc);
#[derive(Clone, Copy, Eq, PartialEq)]
#[allow(dead_code)]
pub enum LdsmSize {
M8N8,
MT8N8,
}
impl fmt::Display for LdsmSize {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
LdsmSize::M8N8 => write!(f, "m8n8"),
LdsmSize::MT8N8 => write!(f, "m8n8.trans"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpLdsm {
#[dst_type(Vec)]
pub dst: Dst,
pub mat_size: LdsmSize,
pub mat_count: u8,
#[src_type(SSA)]
pub addr: Src,
pub offset: i32,
}
impl DisplayOp for OpLdsm {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"ldsm.16.{}.x{} [{}",
self.mat_size, self.mat_count, self.addr,
)?;
if self.offset > 0 {
write!(f, "+{:#x}", self.offset)?;
}
write!(f, "]")
}
}
impl_display_for_op!(OpLdsm);
/// Used on Kepler to implement shared atomics.
/// In addition to the load, it tries to lock the address;
/// Kepler hardware has (1024?) hardware mutex locks.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpLdSharedLock {
pub dst: Dst,
#[dst_type(Pred)]
pub locked: Dst,
#[src_type(GPR)]
pub addr: Src,
pub offset: i32,
pub mem_type: MemType,
}
impl DisplayOp for OpLdSharedLock {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "ldslk{} [{}", self.mem_type, self.addr)?;
if self.offset > 0 {
write!(f, "+{:#x}", self.offset)?;
}
write!(f, "]")
}
}
impl_display_for_op!(OpLdSharedLock);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSt {
#[src_type(GPR)]
pub addr: Src,
#[src_type(SSA)]
pub data: Src,
pub offset: i32,
pub access: MemAccess,
}
impl DisplayOp for OpSt {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "st{} [{}", self.access, self.addr)?;
if self.offset > 0 {
write!(f, "+{:#x}", self.offset)?;
}
write!(f, "] {}", self.data)
}
}
impl_display_for_op!(OpSt);
/// Used on Kepler to implement shared atomics.
/// It checks that the address is still properly locked, performs the
/// store operation, and unlocks the previously locked address.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpStSCheckUnlock {
#[dst_type(Pred)]
pub locked: Dst,
#[src_type(GPR)]
pub addr: Src,
#[src_type(SSA)]
pub data: Src,
pub offset: i32,
pub mem_type: MemType,
}
impl DisplayOp for OpStSCheckUnlock {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "stscul{} [{}", self.mem_type, self.addr)?;
if self.offset > 0 {
write!(f, "+{:#x}", self.offset)?;
}
write!(f, "] {}", self.data)
}
}
impl_display_for_op!(OpStSCheckUnlock);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpAtom {
pub dst: Dst,
#[src_type(GPR)]
pub addr: Src,
#[src_type(GPR)]
pub cmpr: Src,
#[src_type(SSA)]
pub data: Src,
pub atom_op: AtomOp,
pub atom_type: AtomType,
pub addr_offset: i32,
pub mem_space: MemSpace,
pub mem_order: MemOrder,
pub mem_eviction_priority: MemEvictionPriority,
}
impl DisplayOp for OpAtom {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"atom{}{}{}{}{}",
self.atom_op,
self.atom_type,
self.mem_space,
self.mem_order,
self.mem_eviction_priority,
)?;
write!(f, " [")?;
if !self.addr.is_zero() {
write!(f, "{}", self.addr)?;
}
if self.addr_offset > 0 {
if !self.addr.is_zero() {
write!(f, "+")?;
}
write!(f, "{:#x}", self.addr_offset)?;
}
write!(f, "]")?;
if self.atom_op == AtomOp::CmpExch(AtomCmpSrc::Separate) {
write!(f, " {}", self.cmpr)?;
}
write!(f, " {}", self.data)
}
}
impl_display_for_op!(OpAtom);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpAL2P {
pub dst: Dst,
#[src_type(GPR)]
pub offset: Src,
pub addr: u16,
pub comps: u8,
pub output: bool,
}
impl DisplayOp for OpAL2P {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "al2p")?;
if self.output {
write!(f, ".o")?;
}
write!(f, " a[{:#x}", self.addr)?;
if !self.offset.is_zero() {
write!(f, "+{}", self.offset)?;
}
write!(f, "]")
}
}
impl_display_for_op!(OpAL2P);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpALd {
pub dst: Dst,
#[src_type(GPR)]
pub vtx: Src,
#[src_type(GPR)]
pub offset: Src,
pub addr: u16,
pub comps: u8,
pub patch: bool,
pub output: bool,
pub phys: bool,
}
impl DisplayOp for OpALd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "ald")?;
if self.output {
write!(f, ".o")?;
}
if self.patch {
write!(f, ".p")?;
}
if self.phys {
write!(f, ".phys")?;
}
write!(f, " a")?;
if !self.vtx.is_zero() {
write!(f, "[{}]", self.vtx)?;
}
write!(f, "[{:#x}", self.addr)?;
if !self.offset.is_zero() {
write!(f, "+{}", self.offset)?;
}
write!(f, "]")
}
}
impl_display_for_op!(OpALd);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpASt {
#[src_type(GPR)]
pub vtx: Src,
#[src_type(GPR)]
pub offset: Src,
#[src_type(SSA)]
pub data: Src,
pub addr: u16,
pub comps: u8,
pub patch: bool,
pub phys: bool,
}
impl DisplayOp for OpASt {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "ast")?;
if self.patch {
write!(f, ".p")?;
}
if self.phys {
write!(f, ".phys")?;
}
write!(f, " a")?;
if !self.vtx.is_zero() {
write!(f, "[{}]", self.vtx)?;
}
write!(f, "[{:#x}", self.addr)?;
if !self.offset.is_zero() {
write!(f, "+{}", self.offset)?;
}
write!(f, "] {}", self.data)
}
}
impl_display_for_op!(OpASt);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpIpa {
pub dst: Dst,
pub addr: u16,
pub freq: InterpFreq,
pub loc: InterpLoc,
pub inv_w: Src,
pub offset: Src,
}
impl DisplayOp for OpIpa {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"ipa{}{} a[{:#x}] {}",
self.freq, self.loc, self.addr, self.inv_w
)?;
if self.loc == InterpLoc::Offset {
write!(f, " {}", self.offset)?;
}
Ok(())
}
}
impl_display_for_op!(OpIpa);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpLdTram {
pub dst: Dst,
pub addr: u16,
pub use_c: bool,
}
impl DisplayOp for OpLdTram {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "ldtram")?;
if self.use_c {
write!(f, ".c")?;
} else {
write!(f, ".ab")?;
}
write!(f, " a[{:#x}]", self.addr)?;
Ok(())
}
}
impl_display_for_op!(OpLdTram);
#[allow(dead_code)]
#[derive(Copy, Clone, Debug)]
pub enum CCtlOp {
Qry1, // Only available pre-Volta
PF1,
PF1_5, // Only available pre-Volta
PF2,
WB,
IV,
IVAll,
RS,
RSLB, // Only available pre-Volta
IVAllP, // Only available on Volta+
WBAll, // Only available on Volta+
WBAllP, // Only available on Volta+
}
impl CCtlOp {
pub fn is_all(&self) -> bool {
match self {
CCtlOp::Qry1
| CCtlOp::PF1
| CCtlOp::PF1_5
| CCtlOp::PF2
| CCtlOp::WB
| CCtlOp::IV
| CCtlOp::RS
| CCtlOp::RSLB => false,
CCtlOp::IVAll | CCtlOp::IVAllP | CCtlOp::WBAll | CCtlOp::WBAllP => {
true
}
}
}
}
impl fmt::Display for CCtlOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
CCtlOp::Qry1 => write!(f, "qry1"),
CCtlOp::PF1 => write!(f, "pf1"),
CCtlOp::PF1_5 => write!(f, "pf1.5"),
CCtlOp::PF2 => write!(f, "pf2"),
CCtlOp::WB => write!(f, "wb"),
CCtlOp::IV => write!(f, "iv"),
CCtlOp::IVAll => write!(f, "ivall"),
CCtlOp::RS => write!(f, "rs"),
CCtlOp::RSLB => write!(f, "rslb"),
CCtlOp::IVAllP => write!(f, "ivallp"),
CCtlOp::WBAll => write!(f, "wball"),
CCtlOp::WBAllP => write!(f, "wballp"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpCCtl {
pub op: CCtlOp,
pub mem_space: MemSpace,
#[src_type(GPR)]
pub addr: Src,
pub addr_offset: i32,
}
impl DisplayOp for OpCCtl {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "cctl{}", self.mem_space)?;
if !self.op.is_all() {
write!(f, " [{}", self.addr)?;
if self.addr_offset > 0 {
write!(f, "+{:#x}", self.addr_offset)?;
}
write!(f, "]")?;
}
Ok(())
}
}
impl_display_for_op!(OpCCtl);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpMemBar {
pub scope: MemScope,
}
impl DisplayOp for OpMemBar {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "membar.sc.{}", self.scope)
}
}
impl_display_for_op!(OpMemBar);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBClear {
pub dst: Dst,
}
impl DisplayOp for OpBClear {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "bclear")
}
}
impl_display_for_op!(OpBClear);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBMov {
pub dst: Dst,
pub src: Src,
pub clear: bool,
}
impl DisplayOp for OpBMov {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "bmov.32")?;
if self.clear {
write!(f, ".clear")?;
}
write!(f, " {}", self.src)
}
}
impl_display_for_op!(OpBMov);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBreak {
#[dst_type(Bar)]
pub bar_out: Dst,
#[src_type(Bar)]
pub bar_in: Src,
#[src_type(Pred)]
pub cond: Src,
}
impl DisplayOp for OpBreak {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "break {} {}", self.bar_in, self.cond)
}
}
impl_display_for_op!(OpBreak);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBSSy {
#[dst_type(Bar)]
pub bar_out: Dst,
#[src_type(Bar)]
pub bar_in: Src,
#[src_type(Pred)]
pub cond: Src,
pub target: Label,
}
impl DisplayOp for OpBSSy {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "bssy {} {} {}", self.bar_in, self.cond, self.target)
}
}
impl_display_for_op!(OpBSSy);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBSync {
#[src_type(Bar)]
pub bar: Src,
#[src_type(Pred)]
pub cond: Src,
}
impl DisplayOp for OpBSync {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "bsync {} {}", self.bar, self.cond)
}
}
impl_display_for_op!(OpBSync);
/// Takes the branch when the guard predicate and all sources evaluate to true.
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpBra {
pub target: Label,
/// Can be a UPred if uniform
// TODO: actually .u has another form with an additional UPred input.
#[src_type(Pred)]
pub cond: Src,
}
impl DisplayOp for OpBra {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "bra {} {}", self.cond, self.target)
}
}
impl_display_for_op!(OpBra);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSSy {
pub target: Label,
}
impl DisplayOp for OpSSy {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "ssy {}", self.target)
}
}
impl_display_for_op!(OpSSy);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSync {
pub target: Label,
}
impl DisplayOp for OpSync {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "sync {}", self.target)
}
}
impl_display_for_op!(OpSync);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBrk {
pub target: Label,
}
impl DisplayOp for OpBrk {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "brk {}", self.target)
}
}
impl_display_for_op!(OpBrk);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpPBk {
pub target: Label,
}
impl DisplayOp for OpPBk {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "pbk {}", self.target)
}
}
impl_display_for_op!(OpPBk);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpCont {
pub target: Label,
}
impl DisplayOp for OpCont {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "cont {}", self.target)
}
}
impl_display_for_op!(OpCont);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpPCnt {
pub target: Label,
}
impl DisplayOp for OpPCnt {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "pcnt {}", self.target)
}
}
impl_display_for_op!(OpPCnt);
#[repr(C)]
#[derive(Clone, SrcsAsSlice, DstsAsSlice)]
pub struct OpExit {}
impl DisplayOp for OpExit {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "exit")
}
}
impl_display_for_op!(OpExit);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpWarpSync {
pub mask: u32,
}
impl DisplayOp for OpWarpSync {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "warpsync 0x{:x}", self.mask)
}
}
impl_display_for_op!(OpWarpSync);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBar {}
impl DisplayOp for OpBar {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "bar.sync")
}
}
impl_display_for_op!(OpBar);
/// Instruction only used on Kepler (A|B).
/// Kepler has explicit dependency tracking for texture loads.
/// When a texture load is executed, it is put on some kind of FIFO queue
/// for later execution.
/// Before the results of a texture load are used, we need to wait on the
/// queue; texdepbar waits until the queue has at most `textures_left`
/// elements.
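/// A sketch of the typical usage pattern (schematic assembly):
///
/// ```text
/// tex ...        // queued: 1 load in flight
/// tex ...        // queued: 2 loads in flight
/// texdepbar 1    // wait until at most 1 load remains queued, so the
///                // result of the first tex is now safe to consume
/// ```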
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpTexDepBar {
pub textures_left: u8,
}
impl OpTexDepBar {
/// Maximum value of textures_left
///
/// The maximum encodable value is 63. However, nvcc starts emitting
/// TEXDEPBAR 0x3e as soon as it hits 62 texture instructions.
pub const MAX_TEXTURES_LEFT: u8 = 62;
}
impl DisplayOp for OpTexDepBar {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "texdepbar {}", self.textures_left)
}
}
impl_display_for_op!(OpTexDepBar);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpCS2R {
pub dst: Dst,
pub idx: u8,
}
impl DisplayOp for OpCS2R {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "cs2r sr[{:#x}]", self.idx)
}
}
impl_display_for_op!(OpCS2R);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpIsberd {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(SSA)]
pub idx: Src,
}
impl DisplayOp for OpIsberd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "isberd [{}]", self.idx)
}
}
impl_display_for_op!(OpIsberd);
/// Vertex Index Load
/// (Only available in Kepler)
///
/// Takes as input the vertex index and loads the vertex address in
/// attribute space.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpViLd {
#[dst_type(GPR)]
pub dst: Dst,
#[src_type(SSA)]
pub idx: Src,
pub off: i8,
}
impl DisplayOp for OpViLd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "vild v[")?;
if !self.idx.is_zero() {
write!(f, "{}", self.idx)?;
if self.off != 0 {
write!(f, "{:+}", self.off)?;
}
} else {
write!(f, "{}", self.off)?;
}
write!(f, "]")
}
}
impl_display_for_op!(OpViLd);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpKill {}
impl DisplayOp for OpKill {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "kill")
}
}
impl_display_for_op!(OpKill);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpNop {
pub label: Option<Label>,
}
impl DisplayOp for OpNop {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "nop")?;
if let Some(label) = &self.label {
write!(f, " {}", label)?;
}
Ok(())
}
}
impl_display_for_op!(OpNop);
#[allow(dead_code)]
pub enum PixVal {
MsCount,
CovMask,
Covered,
Offset,
CentroidOffset,
MyIndex,
InnerCoverage,
}
impl fmt::Display for PixVal {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PixVal::MsCount => write!(f, ".mscount"),
PixVal::CovMask => write!(f, ".covmask"),
PixVal::Covered => write!(f, ".covered"),
PixVal::Offset => write!(f, ".offset"),
PixVal::CentroidOffset => write!(f, ".centroid_offset"),
PixVal::MyIndex => write!(f, ".my_index"),
PixVal::InnerCoverage => write!(f, ".inner_coverage"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpPixLd {
pub dst: Dst,
pub val: PixVal,
}
impl DisplayOp for OpPixLd {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "pixld{}", self.val)
}
}
impl_display_for_op!(OpPixLd);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpS2R {
pub dst: Dst,
pub idx: u8,
}
impl DisplayOp for OpS2R {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "s2r sr[{:#x}]", self.idx)
}
}
impl_display_for_op!(OpS2R);
pub enum VoteOp {
Any,
All,
Eq,
}
impl fmt::Display for VoteOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
VoteOp::Any => write!(f, "any"),
VoteOp::All => write!(f, "all"),
VoteOp::Eq => write!(f, "eq"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpVote {
pub op: VoteOp,
#[dst_type(GPR)]
pub ballot: Dst,
#[dst_type(Pred)]
pub vote: Dst,
#[src_type(Pred)]
pub pred: Src,
}
impl DisplayOp for OpVote {
fn fmt_dsts(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.ballot.is_none() && self.vote.is_none() {
write!(f, "none")
} else {
if !self.ballot.is_none() {
write!(f, "{}", self.ballot)?;
}
if !self.vote.is_none() {
write!(f, "{}", self.vote)?;
}
Ok(())
}
}
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "vote.{} {}", self.op, self.pred)
}
}
impl_display_for_op!(OpVote);
#[allow(dead_code)]
#[derive(Copy, Clone)]
pub enum MatchOp {
All,
Any,
}
impl fmt::Display for MatchOp {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MatchOp::All => write!(f, ".all"),
MatchOp::Any => write!(f, ".any"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpMatch {
#[dst_type(Pred)]
pub pred: Dst,
#[dst_type(GPR)]
pub mask: Dst,
#[src_type(GPR)]
pub src: Src,
pub op: MatchOp,
pub u64: bool,
}
impl DisplayOp for OpMatch {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let u64_str = if self.u64 { ".u64" } else { "" };
write!(f, "match{}{} {}", self.op, u64_str, self.src)
}
}
impl_display_for_op!(OpMatch);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpUndef {
pub dst: Dst,
}
impl DisplayOp for OpUndef {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "undef {}", self.dst)
}
}
impl_display_for_op!(OpUndef);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSrcBar {
pub src: Src,
}
impl DisplayOp for OpSrcBar {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "src_bar {}", self.src)
}
}
impl_display_for_op!(OpSrcBar);
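/// A pair of parallel `Vec`s kept at the same length and iterated in
/// lock-step.
///
/// A minimal usage sketch (illustrative; `retain` requires `Clone`
/// elements, see below):
///
/// ```text
/// let mut vp: VecPair<u32, &str> = VecPair::new();
/// vp.push(1, "a");
/// vp.push(2, "b");
/// vp.retain(|a, _| *a != 1);
/// assert_eq!(vp.len(), 1);
/// ```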
pub struct VecPair<A, B> {
a: Vec<A>,
b: Vec<B>,
}
impl<A, B> VecPair<A, B> {
pub fn append(&mut self, other: &mut VecPair<A, B>) {
self.a.append(&mut other.a);
self.b.append(&mut other.b);
}
pub fn is_empty(&self) -> bool {
debug_assert!(self.a.len() == self.b.len());
self.a.is_empty()
}
pub fn iter(&self) -> Zip<slice::Iter<'_, A>, slice::Iter<'_, B>> {
debug_assert!(self.a.len() == self.b.len());
self.a.iter().zip(self.b.iter())
}
pub fn iter_mut(
&mut self,
) -> Zip<slice::IterMut<'_, A>, slice::IterMut<'_, B>> {
debug_assert!(self.a.len() == self.b.len());
self.a.iter_mut().zip(self.b.iter_mut())
}
pub fn len(&self) -> usize {
debug_assert!(self.a.len() == self.b.len());
self.a.len()
}
pub fn new() -> Self {
Self {
a: Vec::new(),
b: Vec::new(),
}
}
pub fn push(&mut self, a: A, b: B) {
debug_assert!(self.a.len() == self.b.len());
self.a.push(a);
self.b.push(b);
}
}
impl<A: Clone, B: Clone> VecPair<A, B> {
pub fn retain(&mut self, mut f: impl FnMut(&A, &B) -> bool) {
debug_assert!(self.a.len() == self.b.len());
let len = self.a.len();
let mut i = 0_usize;
while i < len {
if !f(&self.a[i], &self.b[i]) {
break;
}
i += 1;
}
let mut new_len = i;
// Don't check this one twice.
i += 1;
while i < len {
// This could be more efficient but it's good enough for our
// purposes since everything we're storing is small and has a
// trivial Drop.
if f(&self.a[i], &self.b[i]) {
self.a[new_len] = self.a[i].clone();
self.b[new_len] = self.b[i].clone();
new_len += 1;
}
i += 1;
}
if new_len < len {
self.a.truncate(new_len);
self.b.truncate(new_len);
}
}
}
mod phi {
#[allow(unused_imports)]
use crate::ir::{OpPhiDsts, OpPhiSrcs};
use compiler::bitset::IntoBitIndex;
use std::fmt;
/// A phi node
///
/// Phis in NAK are implemented differently from NIR and similar IRs.
/// Instead of having a single phi instruction which lives in the successor
/// block, each `Phi` represents a single merged 32-bit (or 1-bit for
/// predicates) value and we have separate [`OpPhiSrcs`] and [`OpPhiDsts`]
/// instructions which map phis to sources and destinations.
///
/// One of the problems fundamental to phis is that they really live on the
/// edges between blocks. Regardless of where the phi instruction lives in
/// the IR data structures, its sources are consumed at the end of the
/// predecessor block and its destinations are defined at the start of the
/// successor block and all phi sources and destinations get consumed and go
/// live simultaneously for any given CFG edge. For a phi that participates
/// in a back-edge, this means that the source of the phi may be consumed
/// after (in block order) the destination goes live.
///
/// In NIR, this has caused no end of headaches. Most passes which need to
/// process phis ignore phis when first processing a block and then have a
/// special case at the end of each block which walks the successors and
/// processes the successor's phis, looking only at the phi sources whose
/// predecessor matches the block. This is clunky and often forgotten by
/// optimization and lowering pass authors. It's also easy to get missed by
/// testing since it only really breaks if you have a phi which participates
/// in a back-edge so it often gets found later when something breaks in the
/// wild.
///
/// To work around this (and also make things a little more Rust-friendly),
/// NAK places the instruction which consumes phi sources at the end of the
/// predecessor block and the instruction which defines phi destinations at
/// the start of the successor block. This structurally eliminates the
/// problem that has plagued NIR for years. The cost to this solution is
/// that we have to create maps from phis to/from SSA values whenever we
/// want to optimize the phis themselves. However, this affects few enough
/// passes that the benefits to the rest of the IR are worth the trade-off,
/// at least for a back-end compiler.
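/// As a schematic sketch of the printed form (`<src>` and `<dst>` stand
/// for whatever SSA value or register is mapped; see the `DisplayOp`
/// impls for [`OpPhiSrcs`] and [`OpPhiDsts`]):
///
/// ```text
/// // at the end of the predecessor block:
/// phi_src φ0 = <src>
/// // at the start of the successor block:
/// phi_dst <dst> = φ0
/// ```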
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub struct Phi {
idx: u32,
}
impl IntoBitIndex for Phi {
fn into_bit_index(self) -> usize {
self.idx.try_into().unwrap()
}
}
impl fmt::Display for Phi {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "φ{}", self.idx)
}
}
pub struct PhiAllocator {
count: u32,
}
impl PhiAllocator {
pub fn new() -> PhiAllocator {
PhiAllocator { count: 0 }
}
pub fn alloc(&mut self) -> Phi {
let idx = self.count;
self.count = idx + 1;
Phi { idx }
}
}
}
pub use phi::{Phi, PhiAllocator};
/// An instruction which maps [Phi]s to sources in the predecessor block
#[repr(C)]
#[derive(DstsAsSlice)]
pub struct OpPhiSrcs {
pub srcs: VecPair<Phi, Src>,
}
impl OpPhiSrcs {
pub fn new() -> OpPhiSrcs {
OpPhiSrcs {
srcs: VecPair::new(),
}
}
}
impl AsSlice<Src> for OpPhiSrcs {
type Attr = SrcType;
fn as_slice(&self) -> &[Src] {
&self.srcs.b
}
fn as_mut_slice(&mut self) -> &mut [Src] {
&mut self.srcs.b
}
fn attrs(&self) -> SrcTypeList {
SrcTypeList::Uniform(SrcType::GPR)
}
}
impl DisplayOp for OpPhiSrcs {
fn fmt_dsts(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
Ok(())
}
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "phi_src ")?;
for (i, (phi, src)) in self.srcs.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{phi} = {src}")?;
}
Ok(())
}
}
impl_display_for_op!(OpPhiSrcs);
/// An instruction which maps [Phi]s to destinations in the successor block
#[repr(C)]
#[derive(SrcsAsSlice)]
pub struct OpPhiDsts {
pub dsts: VecPair<Phi, Dst>,
}
impl OpPhiDsts {
pub fn new() -> OpPhiDsts {
OpPhiDsts {
dsts: VecPair::new(),
}
}
}
impl AsSlice<Dst> for OpPhiDsts {
type Attr = DstType;
fn as_slice(&self) -> &[Dst] {
&self.dsts.b
}
fn as_mut_slice(&mut self) -> &mut [Dst] {
&mut self.dsts.b
}
fn attrs(&self) -> DstTypeList {
DstTypeList::Uniform(DstType::Vec)
}
}
impl DisplayOp for OpPhiDsts {
fn fmt_dsts(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
Ok(())
}
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "phi_dst ")?;
for (i, (phi, dst)) in self.dsts.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{dst} = {phi}")?;
}
Ok(())
}
}
impl_display_for_op!(OpPhiDsts);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpCopy {
pub dst: Dst,
pub src: Src,
}
impl DisplayOp for OpCopy {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "copy {}", self.src)
}
}
impl_display_for_op!(OpCopy);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
/// Copies a value and pins its destination in the register file
pub struct OpPin {
pub dst: Dst,
#[src_type(SSA)]
pub src: Src,
}
impl DisplayOp for OpPin {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "pin {}", self.src)
}
}
impl_display_for_op!(OpPin);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
/// Copies a pinned value to an unpinned value
pub struct OpUnpin {
pub dst: Dst,
#[src_type(SSA)]
pub src: Src,
}
impl DisplayOp for OpUnpin {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "unpin {}", self.src)
}
}
impl_display_for_op!(OpUnpin);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSwap {
pub dsts: [Dst; 2],
pub srcs: [Src; 2],
}
impl DisplayOp for OpSwap {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "swap {} {}", self.srcs[0], self.srcs[1])
}
}
impl_display_for_op!(OpSwap);
#[repr(C)]
pub struct OpParCopy {
pub dsts_srcs: VecPair<Dst, Src>,
pub tmp: Option<RegRef>,
}
impl OpParCopy {
pub fn new() -> OpParCopy {
OpParCopy {
dsts_srcs: VecPair::new(),
tmp: None,
}
}
pub fn is_empty(&self) -> bool {
self.dsts_srcs.is_empty()
}
pub fn push(&mut self, dst: Dst, src: Src) {
self.dsts_srcs.push(dst, src);
}
}
impl AsSlice<Src> for OpParCopy {
type Attr = SrcType;
fn as_slice(&self) -> &[Src] {
&self.dsts_srcs.b
}
fn as_mut_slice(&mut self) -> &mut [Src] {
&mut self.dsts_srcs.b
}
fn attrs(&self) -> SrcTypeList {
SrcTypeList::Uniform(SrcType::GPR)
}
}
impl AsSlice<Dst> for OpParCopy {
type Attr = DstType;
fn as_slice(&self) -> &[Dst] {
&self.dsts_srcs.a
}
fn as_mut_slice(&mut self) -> &mut [Dst] {
&mut self.dsts_srcs.a
}
fn attrs(&self) -> DstTypeList {
DstTypeList::Uniform(DstType::Vec)
}
}
impl DisplayOp for OpParCopy {
fn fmt_dsts(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
Ok(())
}
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "par_copy")?;
for (i, (dst, src)) in self.dsts_srcs.iter().enumerate() {
if i > 0 {
write!(f, ",")?;
}
write!(f, " {} = {}", dst, src)?;
}
Ok(())
}
}
impl_display_for_op!(OpParCopy);
#[repr(C)]
#[derive(DstsAsSlice)]
pub struct OpRegOut {
pub srcs: Vec<Src>,
}
impl AsSlice<Src> for OpRegOut {
type Attr = SrcType;
fn as_slice(&self) -> &[Src] {
&self.srcs
}
fn as_mut_slice(&mut self) -> &mut [Src] {
&mut self.srcs
}
fn attrs(&self) -> SrcTypeList {
SrcTypeList::Uniform(SrcType::GPR)
}
}
impl DisplayOp for OpRegOut {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "reg_out {{")?;
for (i, src) in self.srcs.iter().enumerate() {
if i > 0 {
write!(f, ",")?;
}
write!(f, " {}", src)?;
}
write!(f, " }}")
}
}
impl_display_for_op!(OpRegOut);
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum OutType {
Emit,
Cut,
EmitThenCut,
}
impl fmt::Display for OutType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
OutType::Emit => write!(f, "emit"),
OutType::Cut => write!(f, "cut"),
OutType::EmitThenCut => write!(f, "emit_then_cut"),
}
}
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpOut {
pub dst: Dst,
#[src_type(SSA)]
pub handle: Src,
#[src_type(ALU)]
pub stream: Src,
pub out_type: OutType,
}
impl DisplayOp for OpOut {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "out.{} {} {}", self.out_type, self.handle, self.stream)
}
}
impl_display_for_op!(OpOut);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpOutFinal {
#[src_type(SSA)]
pub handle: Src,
}
impl DisplayOp for OpOutFinal {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "out.final {{ {} }}", self.handle)
}
}
impl_display_for_op!(OpOutFinal);
/// Describes an annotation on an instruction.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpAnnotate {
/// The annotation
pub annotation: String,
}
impl DisplayOp for OpAnnotate {
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "// {}", self.annotation)
}
}
impl fmt::Display for OpAnnotate {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.fmt_op(f)
}
}
#[derive(DisplayOp, DstsAsSlice, SrcsAsSlice, FromVariants)]
pub enum Op {
FAdd(Box<OpFAdd>),
FFma(Box<OpFFma>),
FMnMx(Box<OpFMnMx>),
FMul(Box<OpFMul>),
Rro(Box<OpRro>),
MuFu(Box<OpMuFu>),
FSet(Box<OpFSet>),
FSetP(Box<OpFSetP>),
FSwzAdd(Box<OpFSwzAdd>),
FSwz(Box<OpFSwz>),
DAdd(Box<OpDAdd>),
DFma(Box<OpDFma>),
DMnMx(Box<OpDMnMx>),
DMul(Box<OpDMul>),
DSetP(Box<OpDSetP>),
HAdd2(Box<OpHAdd2>),
HFma2(Box<OpHFma2>),
HMul2(Box<OpHMul2>),
HSet2(Box<OpHSet2>),
HSetP2(Box<OpHSetP2>),
Imma(Box<OpImma>),
Hmma(Box<OpHmma>),
Ldsm(Box<OpLdsm>),
HMnMx2(Box<OpHMnMx2>),
BMsk(Box<OpBMsk>),
BRev(Box<OpBRev>),
Bfe(Box<OpBfe>),
Flo(Box<OpFlo>),
IAbs(Box<OpIAbs>),
IAdd2(Box<OpIAdd2>),
IAdd2X(Box<OpIAdd2X>),
IAdd3(Box<OpIAdd3>),
IAdd3X(Box<OpIAdd3X>),
IDp4(Box<OpIDp4>),
IMad(Box<OpIMad>),
IMad64(Box<OpIMad64>),
IMul(Box<OpIMul>),
IMnMx(Box<OpIMnMx>),
ISetP(Box<OpISetP>),
Lea(Box<OpLea>),
LeaX(Box<OpLeaX>),
Lop2(Box<OpLop2>),
Lop3(Box<OpLop3>),
PopC(Box<OpPopC>),
Shf(Box<OpShf>),
Shl(Box<OpShl>),
Shr(Box<OpShr>),
F2F(Box<OpF2F>),
F2FP(Box<OpF2FP>),
F2I(Box<OpF2I>),
I2F(Box<OpI2F>),
I2I(Box<OpI2I>),
FRnd(Box<OpFRnd>),
Mov(Box<OpMov>),
Prmt(Box<OpPrmt>),
Sel(Box<OpSel>),
Shfl(Box<OpShfl>),
PLop3(Box<OpPLop3>),
PSetP(Box<OpPSetP>),
R2UR(Box<OpR2UR>),
Redux(Box<OpRedux>),
Tex(Box<OpTex>),
Tld(Box<OpTld>),
Tld4(Box<OpTld4>),
Tmml(Box<OpTmml>),
Txd(Box<OpTxd>),
Txq(Box<OpTxq>),
SuLd(Box<OpSuLd>),
SuSt(Box<OpSuSt>),
SuAtom(Box<OpSuAtom>),
SuClamp(Box<OpSuClamp>),
SuBfm(Box<OpSuBfm>),
SuEau(Box<OpSuEau>),
IMadSp(Box<OpIMadSp>),
SuLdGa(Box<OpSuLdGa>),
SuStGa(Box<OpSuStGa>),
Ld(Box<OpLd>),
Ldc(Box<OpLdc>),
LdSharedLock(Box<OpLdSharedLock>),
St(Box<OpSt>),
StSCheckUnlock(Box<OpStSCheckUnlock>),
Atom(Box<OpAtom>),
AL2P(Box<OpAL2P>),
ALd(Box<OpALd>),
ASt(Box<OpASt>),
Ipa(Box<OpIpa>),
LdTram(Box<OpLdTram>),
CCtl(Box<OpCCtl>),
MemBar(Box<OpMemBar>),
BClear(Box<OpBClear>),
BMov(Box<OpBMov>),
Break(Box<OpBreak>),
BSSy(Box<OpBSSy>),
BSync(Box<OpBSync>),
Bra(Box<OpBra>),
SSy(OpSSy),
Sync(OpSync),
Brk(OpBrk),
PBk(OpPBk),
Cont(OpCont),
PCnt(OpPCnt),
Exit(OpExit),
WarpSync(Box<OpWarpSync>),
Bar(Box<OpBar>),
TexDepBar(Box<OpTexDepBar>),
CS2R(Box<OpCS2R>),
Isberd(Box<OpIsberd>),
ViLd(Box<OpViLd>),
Kill(Box<OpKill>),
Nop(OpNop),
PixLd(Box<OpPixLd>),
S2R(Box<OpS2R>),
Vote(Box<OpVote>),
Match(Box<OpMatch>),
Undef(Box<OpUndef>),
SrcBar(Box<OpSrcBar>),
PhiSrcs(Box<OpPhiSrcs>),
PhiDsts(Box<OpPhiDsts>),
Copy(Box<OpCopy>),
Pin(Box<OpPin>),
Unpin(Box<OpUnpin>),
Swap(Box<OpSwap>),
ParCopy(Box<OpParCopy>),
RegOut(Box<OpRegOut>),
Out(Box<OpOut>),
OutFinal(Box<OpOutFinal>),
Annotate(Box<OpAnnotate>),
}
impl_display_for_op!(Op);
#[cfg(target_arch = "x86_64")]
const _: () = {
debug_assert!(size_of::<Op>() == 16);
};
impl Op {
pub fn is_branch(&self) -> bool {
match self {
Op::Bra(_)
| Op::Sync(_)
| Op::Brk(_)
| Op::Cont(_)
| Op::Exit(_) => true,
_ => false,
}
}
pub fn is_fp64(&self) -> bool {
match self {
Op::MuFu(op) => matches!(op.op, MuFuOp::Rcp64H | MuFuOp::Rsq64H),
Op::DAdd(_)
| Op::DFma(_)
| Op::DMnMx(_)
| Op::DMul(_)
| Op::DSetP(_) => true,
Op::F2F(op) => op.src_type.bits() == 64 || op.dst_type.bits() == 64,
Op::F2I(op) => op.src_type.bits() == 64 || op.dst_type.bits() == 64,
Op::I2F(op) => op.src_type.bits() == 64 || op.dst_type.bits() == 64,
Op::FRnd(op) => {
op.src_type.bits() == 64 || op.dst_type.bits() == 64
}
_ => false,
}
}
pub fn has_fixed_latency(&self, sm: u8) -> bool {
match self {
// Float ALU
Op::F2FP(_)
| Op::FAdd(_)
| Op::FFma(_)
| Op::FMnMx(_)
| Op::FMul(_)
| Op::FSet(_)
| Op::FSetP(_)
| Op::HAdd2(_)
| Op::HFma2(_)
| Op::HMul2(_)
| Op::HSet2(_)
| Op::HSetP2(_)
| Op::HMnMx2(_)
| Op::FSwz(_)
| Op::FSwzAdd(_) => true,
// Multi-function unit is variable latency
Op::Rro(_) | Op::MuFu(_) => false,
// Double-precision float ALU
Op::DAdd(_)
| Op::DFma(_)
| Op::DMnMx(_)
| Op::DMul(_)
| Op::DSetP(_) => false,
// Matrix Multiply Add
Op::Imma(_) | Op::Hmma(_) | Op::Ldsm(_) => false,
// Integer ALU
Op::BRev(_) | Op::Flo(_) | Op::PopC(_) => false,
Op::IMad(_) | Op::IMul(_) => sm >= 70,
Op::BMsk(_)
| Op::IAbs(_)
| Op::IAdd2(_)
| Op::IAdd2X(_)
| Op::IAdd3(_)
| Op::IAdd3X(_)
| Op::IDp4(_)
| Op::IMad64(_)
| Op::IMnMx(_)
| Op::ISetP(_)
| Op::Lea(_)
| Op::LeaX(_)
| Op::Lop2(_)
| Op::Lop3(_)
| Op::SuClamp(_)
| Op::SuBfm(_)
| Op::SuEau(_)
| Op::IMadSp(_)
| Op::Shf(_)
| Op::Shl(_)
| Op::Shr(_)
| Op::Bfe(_) => true,
// Conversions are variable latency?!?
Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::I2I(_) | Op::FRnd(_) => {
false
}
// Move ops
Op::Mov(_) | Op::Prmt(_) | Op::Sel(_) => true,
Op::Shfl(_) => false,
// Predicate ops
Op::PLop3(_) | Op::PSetP(_) => true,
// Uniform ops
Op::R2UR(_) | Op::Redux(_) => false,
// Texture ops
Op::Tex(_)
| Op::Tld(_)
| Op::Tld4(_)
| Op::Tmml(_)
| Op::Txd(_)
| Op::Txq(_) => false,
// Surface ops
Op::SuLd(_)
| Op::SuSt(_)
| Op::SuAtom(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => false,
// Memory ops
Op::Ld(_)
| Op::Ldc(_)
| Op::LdSharedLock(_)
| Op::St(_)
| Op::StSCheckUnlock(_)
| Op::Atom(_)
| Op::AL2P(_)
| Op::ALd(_)
| Op::ASt(_)
| Op::Ipa(_)
| Op::CCtl(_)
| Op::LdTram(_)
| Op::MemBar(_) => false,
// Control-flow ops
Op::BClear(_)
| Op::Break(_)
| Op::BSSy(_)
| Op::BSync(_)
| Op::SSy(_)
| Op::Sync(_)
| Op::Brk(_)
| Op::PBk(_)
| Op::Cont(_)
| Op::PCnt(_)
| Op::Bra(_)
| Op::Exit(_)
| Op::WarpSync(_) => false,
// The barrier half is HW scoreboarded but the GPR isn't. When
// moving from a GPR to a barrier, we still need a token for WaR
// hazards.
Op::BMov(_) => false,
// Geometry ops
Op::Out(_) | Op::OutFinal(_) => false,
// Miscellaneous ops
Op::Bar(_)
| Op::TexDepBar(_)
| Op::CS2R(_)
| Op::Isberd(_)
| Op::ViLd(_)
| Op::Kill(_)
| Op::PixLd(_)
| Op::S2R(_)
| Op::Match(_) => false,
Op::Nop(_) | Op::Vote(_) => true,
// Virtual ops
Op::Undef(_)
| Op::SrcBar(_)
| Op::PhiSrcs(_)
| Op::PhiDsts(_)
| Op::Copy(_)
| Op::Pin(_)
| Op::Unpin(_)
| Op::Swap(_)
| Op::ParCopy(_)
| Op::RegOut(_)
| Op::Annotate(_) => {
panic!("Not a hardware opcode")
}
}
}
/// Some decoupled instructions don't need scoreboards, due to the way we
/// use them.
pub fn no_scoreboard(&self) -> bool {
match self {
Op::BClear(_)
| Op::Break(_)
| Op::BSSy(_)
| Op::BSync(_)
| Op::SSy(_)
| Op::Sync(_)
| Op::Brk(_)
| Op::PBk(_)
| Op::Cont(_)
| Op::PCnt(_)
| Op::Bra(_)
| Op::Exit(_) => true,
_ => false,
}
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub enum PredRef {
None,
SSA(SSAValue),
Reg(RegRef),
}
impl PredRef {
#[allow(dead_code)]
pub fn as_reg(&self) -> Option<&RegRef> {
match self {
PredRef::Reg(r) => Some(r),
_ => None,
}
}
#[allow(dead_code)]
pub fn as_ssa(&self) -> Option<&SSAValue> {
match self {
PredRef::SSA(r) => Some(r),
_ => None,
}
}
pub fn is_none(&self) -> bool {
matches!(self, PredRef::None)
}
pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> {
match self {
PredRef::None | PredRef::Reg(_) => &[],
PredRef::SSA(ssa) => slice::from_ref(ssa),
}
.iter()
}
pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> {
match self {
PredRef::None | PredRef::Reg(_) => &mut [],
PredRef::SSA(ssa) => slice::from_mut(ssa),
}
.iter_mut()
}
}
impl From<RegRef> for PredRef {
fn from(reg: RegRef) -> PredRef {
PredRef::Reg(reg)
}
}
impl From<SSAValue> for PredRef {
fn from(ssa: SSAValue) -> PredRef {
PredRef::SSA(ssa)
}
}
impl fmt::Display for PredRef {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PredRef::None => write!(f, "pT"),
PredRef::SSA(ssa) => ssa.fmt(f),
PredRef::Reg(reg) => reg.fmt(f),
}
}
}
#[derive(Clone, Copy)]
pub struct Pred {
pub pred_ref: PredRef,
pub pred_inv: bool,
}
impl Pred {
pub fn is_true(&self) -> bool {
self.pred_ref.is_none() && !self.pred_inv
}
pub fn is_false(&self) -> bool {
self.pred_ref.is_none() && self.pred_inv
}
pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> {
self.pred_ref.iter_ssa()
}
pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> {
self.pred_ref.iter_ssa_mut()
}
pub fn bnot(self) -> Self {
Pred {
pred_ref: self.pred_ref,
pred_inv: !self.pred_inv,
}
}
}
impl From<bool> for Pred {
fn from(b: bool) -> Self {
Pred {
pred_ref: PredRef::None,
pred_inv: !b,
}
}
}
impl<T: Into<PredRef>> From<T> for Pred {
fn from(p: T) -> Self {
Pred {
pred_ref: p.into(),
pred_inv: false,
}
}
}
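// A minimal sketch of the `Pred` conveniences defined above (illustrative
// only):
//
// let always: Pred = true.into(); // pT, not inverted
// assert!(always.is_true());
// let never = always.bnot();      // !pT
// assert!(never.is_false());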
impl fmt::Display for Pred {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.pred_inv {
write!(f, "!")?;
}
self.pred_ref.fmt(f)
}
}
pub const MIN_INSTR_DELAY: u8 = 1;
pub struct InstrDeps {
pub delay: u8,
pub yld: bool,
wr_bar: i8,
rd_bar: i8,
pub wt_bar_mask: u8,
pub reuse_mask: u8,
}
impl InstrDeps {
pub fn new() -> InstrDeps {
InstrDeps {
delay: 0,
yld: false,
wr_bar: -1,
rd_bar: -1,
wt_bar_mask: 0,
reuse_mask: 0,
}
}
pub fn rd_bar(&self) -> Option<u8> {
if self.rd_bar < 0 {
None
} else {
Some(self.rd_bar.try_into().unwrap())
}
}
pub fn wr_bar(&self) -> Option<u8> {
if self.wr_bar < 0 {
None
} else {
Some(self.wr_bar.try_into().unwrap())
}
}
pub fn set_delay(&mut self, delay: u8) {
self.delay = delay;
}
pub fn set_yield(&mut self, yld: bool) {
self.yld = yld;
}
pub fn set_rd_bar(&mut self, idx: u8) {
assert!(idx < 6);
self.rd_bar = idx.try_into().unwrap();
}
pub fn set_wr_bar(&mut self, idx: u8) {
assert!(idx < 6);
self.wr_bar = idx.try_into().unwrap();
}
pub fn add_wt_bar(&mut self, idx: u8) {
self.add_wt_bar_mask(1 << idx);
}
pub fn add_wt_bar_mask(&mut self, bar_mask: u8) {
assert!(bar_mask < 1 << 6);
self.wt_bar_mask |= bar_mask;
}
#[allow(dead_code)]
pub fn add_reuse(&mut self, idx: u8) {
assert!(idx < 6);
self.reuse_mask |= 1_u8 << idx;
}
}
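// A minimal sketch of how a scheduler might fill in `InstrDeps`, using
// only the methods defined above (the barrier indices are illustrative):
//
// let mut deps = InstrDeps::new();
// deps.set_delay(MIN_INSTR_DELAY);
// deps.add_wt_bar(0); // wait on scoreboard barrier 0 before issuing
// deps.set_rd_bar(1); // allocate barrier 1 to cover read (WaR) hazards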
impl fmt::Display for InstrDeps {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.delay > 0 {
write!(f, " delay={}", self.delay)?;
}
if self.wt_bar_mask != 0 {
write!(f, " wt={:06b}", self.wt_bar_mask)?;
}
if self.rd_bar >= 0 {
write!(f, " rd:{}", self.rd_bar)?;
}
if self.wr_bar >= 0 {
write!(f, " wr:{}", self.wr_bar)?;
}
if self.reuse_mask != 0 {
write!(f, " reuse={:06b}", self.reuse_mask)?;
}
if self.yld {
write!(f, " yld")?;
}
Ok(())
}
}
pub struct Instr {
pub pred: Pred,
pub op: Op,
pub deps: InstrDeps,
}
impl Instr {
pub fn new(op: impl Into<Op>) -> Self {
Self {
op: op.into(),
pred: true.into(),
deps: InstrDeps::new(),
}
}
pub fn dsts(&self) -> &[Dst] {
self.op.dsts_as_slice()
}
pub fn dsts_mut(&mut self) -> &mut [Dst] {
self.op.dsts_as_mut_slice()
}
pub fn srcs(&self) -> &[Src] {
self.op.srcs_as_slice()
}
pub fn srcs_mut(&mut self) -> &mut [Src] {
self.op.srcs_as_mut_slice()
}
pub fn src_types(&self) -> SrcTypeList {
self.op.src_types()
}
pub fn for_each_ssa_use(&self, mut f: impl FnMut(&SSAValue)) {
for ssa in self.pred.iter_ssa() {
f(ssa);
}
for src in self.srcs() {
for ssa in src.iter_ssa() {
f(ssa);
}
}
}
pub fn for_each_ssa_use_mut(&mut self, mut f: impl FnMut(&mut SSAValue)) {
for ssa in self.pred.iter_ssa_mut() {
f(ssa);
}
for src in self.srcs_mut() {
for ssa in src.iter_ssa_mut() {
f(ssa);
}
}
}
pub fn for_each_ssa_def(&self, mut f: impl FnMut(&SSAValue)) {
for dst in self.dsts() {
for ssa in dst.iter_ssa() {
f(ssa);
}
}
}
pub fn for_each_ssa_def_mut(&mut self, mut f: impl FnMut(&mut SSAValue)) {
for dst in self.dsts_mut() {
for ssa in dst.iter_ssa_mut() {
f(ssa);
}
}
}
pub fn is_branch(&self) -> bool {
self.op.is_branch()
}
/// Returns true if `self` is a branch instruction that is always taken.
/// It returns false for non-branch instructions.
pub fn is_branch_always_taken(&self) -> bool {
if self.pred.is_true() {
match &self.op {
Op::Bra(bra) => bra.cond.is_true(),
_ => self.is_branch(),
}
} else {
false
}
}
pub fn uses_global_mem(&self) -> bool {
match &self.op {
Op::Atom(op) => op.mem_space != MemSpace::Local,
Op::Ld(op) => op.access.space != MemSpace::Local,
Op::St(op) => op.access.space != MemSpace::Local,
Op::SuAtom(_)
| Op::SuLd(_)
| Op::SuSt(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => true,
_ => false,
}
}
pub fn writes_global_mem(&self) -> bool {
match &self.op {
Op::Atom(op) => matches!(op.mem_space, MemSpace::Global(_)),
Op::St(op) => matches!(op.access.space, MemSpace::Global(_)),
Op::SuAtom(_) | Op::SuSt(_) | Op::SuStGa(_) => true,
_ => false,
}
}
pub fn can_eliminate(&self) -> bool {
match &self.op {
Op::ASt(_)
| Op::SuSt(_)
| Op::SuStGa(_)
| Op::SuAtom(_)
| Op::LdSharedLock(_)
| Op::St(_)
| Op::StSCheckUnlock(_)
| Op::Atom(_)
| Op::CCtl(_)
| Op::MemBar(_)
| Op::Kill(_)
| Op::Nop(_)
| Op::BSync(_)
| Op::Bra(_)
| Op::SSy(_)
| Op::Sync(_)
| Op::Brk(_)
| Op::PBk(_)
| Op::Cont(_)
| Op::PCnt(_)
| Op::Exit(_)
| Op::WarpSync(_)
| Op::Bar(_)
| Op::TexDepBar(_)
| Op::RegOut(_)
| Op::Out(_)
| Op::OutFinal(_)
| Op::Annotate(_) => false,
Op::BMov(op) => !op.clear,
_ => true,
}
}
pub fn is_uniform(&self) -> bool {
match &self.op {
Op::PhiDsts(_) => false,
op => op.is_uniform(),
}
}
pub fn needs_yield(&self) -> bool {
matches!(&self.op, Op::Bar(_) | Op::BSync(_))
}
fn fmt_pred(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if !self.pred.is_true() {
write!(f, "@{} ", self.pred)?;
}
Ok(())
}
}
impl fmt::Display for Instr {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{} {}{}", Fmt(|f| self.fmt_pred(f)), self.op, self.deps)
}
}
impl<T: Into<Op>> From<T> for Instr {
fn from(value: T) -> Self {
Self::new(value)
}
}
pub type MappedInstrs = SmallVec<Instr>;
pub struct BasicBlock {
pub label: Label,
/// Whether or not this block is uniform
///
/// If true, then all non-exited lanes in a warp which execute this block
/// are guaranteed to execute it together
pub uniform: bool,
pub instrs: Vec<Instr>,
}
impl BasicBlock {
pub fn map_instrs(&mut self, mut map: impl FnMut(Instr) -> MappedInstrs) {
let mut instrs = Vec::new();
for i in self.instrs.drain(..) {
match map(i) {
MappedInstrs::None => (),
MappedInstrs::One(i) => {
instrs.push(i);
}
MappedInstrs::Many(mut v) => {
instrs.append(&mut v);
}
}
}
self.instrs = instrs;
}
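// A sketch of how map_instrs is typically used: each instruction may be
// dropped, kept as-is, or expanded into a sequence.  For example, deleting
// every OpNop from a block (`bb` is a hypothetical &mut BasicBlock; see
// also Shader::remove_annotations below for the same pattern):
//
//     bb.map_instrs(|i| {
//         if matches!(i.op, Op::Nop(_)) {
//             MappedInstrs::None
//         } else {
//             MappedInstrs::One(i)
//         }
//     });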
pub fn phi_dsts_ip(&self) -> Option<usize> {
for (ip, instr) in self.instrs.iter().enumerate() {
match &instr.op {
Op::Annotate(_) => (),
Op::PhiDsts(_) => return Some(ip),
_ => break,
}
}
None
}
pub fn phi_dsts(&self) -> Option<&OpPhiDsts> {
self.phi_dsts_ip().map(|ip| match &self.instrs[ip].op {
Op::PhiDsts(phi) => phi.deref(),
_ => panic!("Expected to find the phi"),
})
}
#[allow(dead_code)]
pub fn phi_dsts_mut(&mut self) -> Option<&mut OpPhiDsts> {
self.phi_dsts_ip().map(|ip| match &mut self.instrs[ip].op {
Op::PhiDsts(phi) => phi.deref_mut(),
_ => panic!("Expected to find the phi"),
})
}
pub fn phi_srcs_ip(&self) -> Option<usize> {
for (ip, instr) in self.instrs.iter().enumerate().rev() {
match &instr.op {
Op::Annotate(_) => (),
Op::PhiSrcs(_) => return Some(ip),
_ if instr.is_branch() => (),
_ => break,
}
}
None
}
pub fn phi_srcs(&self) -> Option<&OpPhiSrcs> {
self.phi_srcs_ip().map(|ip| match &self.instrs[ip].op {
Op::PhiSrcs(phi) => phi.deref(),
_ => panic!("Expected to find the phi"),
})
}
pub fn phi_srcs_mut(&mut self) -> Option<&mut OpPhiSrcs> {
self.phi_srcs_ip().map(|ip| match &mut self.instrs[ip].op {
Op::PhiSrcs(phi) => phi.deref_mut(),
_ => panic!("Expected to find the phi"),
})
}
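// The helpers below rely on the invariant that, when a block ends in a
// branch, the branch is its final instruction.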
pub fn branch(&self) -> Option<&Instr> {
if let Some(i) = self.instrs.last() {
if i.is_branch() {
Some(i)
} else {
None
}
} else {
None
}
}
pub fn branch_ip(&self) -> Option<usize> {
if let Some(i) = self.instrs.last() {
if i.is_branch() {
Some(self.instrs.len() - 1)
} else {
None
}
} else {
None
}
}
#[allow(dead_code)]
pub fn branch_mut(&mut self) -> Option<&mut Instr> {
if let Some(i) = self.instrs.last_mut() {
if i.is_branch() {
Some(i)
} else {
None
}
} else {
None
}
}
pub fn falls_through(&self) -> bool {
if let Some(i) = self.branch() {
!i.is_branch_always_taken()
} else {
true
}
}
}
pub struct Function {
pub ssa_alloc: SSAValueAllocator,
pub phi_alloc: PhiAllocator,
pub blocks: CFG<BasicBlock>,
}
impl Function {
pub fn map_instrs(
&mut self,
mut map: impl FnMut(Instr, &mut SSAValueAllocator) -> MappedInstrs,
) {
let alloc = &mut self.ssa_alloc;
for b in &mut self.blocks {
b.map_instrs(|i| map(i, alloc));
}
}
}
impl fmt::Display for Function {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut pred_width = 0;
let mut dsts_width = 0;
let mut op_width = 0;
let mut blocks = Vec::new();
for b in &self.blocks {
let mut instrs = Vec::new();
for i in &b.instrs {
let mut pred = String::new();
write!(pred, "{}", Fmt(|f| i.fmt_pred(f)))?;
let mut dsts = String::new();
write!(dsts, "{}", Fmt(|f| i.op.fmt_dsts(f)))?;
let mut op = String::new();
write!(op, "{}", Fmt(|f| i.op.fmt_op(f)))?;
let mut deps = String::new();
write!(deps, "{}", i.deps)?;
pred_width = max(pred_width, pred.len());
dsts_width = max(dsts_width, dsts.len());
op_width = max(op_width, op.len());
let is_annotation = matches!(i.op, Op::Annotate(_));
instrs.push((pred, dsts, op, deps, is_annotation));
}
blocks.push(instrs);
}
for (i, mut b) in blocks.drain(..).enumerate() {
let u = if self.blocks[i].uniform { ".u" } else { "" };
write!(f, "block{u} {} {} [", i, self.blocks[i].label)?;
for (pi, p) in self.blocks.pred_indices(i).iter().enumerate() {
if pi > 0 {
write!(f, ", ")?;
}
write!(f, "{}", p)?;
}
write!(f, "] -> {{\n")?;
for (pred, dsts, op, deps, is_annotation) in b.drain(..) {
let eq_sym = if dsts.is_empty() { " " } else { "=" };
if is_annotation {
write!(f, "\n{}\n", op)?;
} else if deps.is_empty() {
write!(
f,
"{:<pred_width$} {:<dsts_width$} {} {}\n",
pred, dsts, eq_sym, op,
)?;
} else {
write!(
f,
"{:<pred_width$} {:<dsts_width$} {} \
{:<op_width$} //{}\n",
pred, dsts, eq_sym, op, deps,
)?;
}
}
write!(f, "}} -> [")?;
for (si, s) in self.blocks.succ_indices(i).iter().enumerate() {
if si > 0 {
write!(f, ", ")?;
}
write!(f, "{}", s)?;
}
write!(f, "]\n")?;
}
Ok(())
}
}
#[derive(Debug)]
pub struct ComputeShaderInfo {
pub local_size: [u16; 3],
pub smem_size: u16,
}
#[derive(Debug)]
pub struct FragmentShaderInfo {
pub uses_kill: bool,
pub does_interlock: bool,
pub post_depth_coverage: bool,
pub early_fragment_tests: bool,
pub uses_sample_shading: bool,
}
#[derive(Debug)]
pub struct GeometryShaderInfo {
pub passthrough_enable: bool,
pub stream_out_mask: u8,
pub threads_per_input_primitive: u8,
pub output_topology: OutputTopology,
pub max_output_vertex_count: u16,
}
impl Default for GeometryShaderInfo {
fn default() -> Self {
Self {
passthrough_enable: false,
stream_out_mask: 0,
threads_per_input_primitive: 0,
output_topology: OutputTopology::LineStrip,
max_output_vertex_count: 0,
}
}
}
#[derive(Debug)]
pub struct TessellationInitShaderInfo {
pub per_patch_attribute_count: u8,
pub threads_per_patch: u8,
}
#[repr(u8)]
#[derive(Clone, Copy, Debug)]
pub enum TessellationDomain {
Isoline = NAK_TS_DOMAIN_ISOLINE,
Triangle = NAK_TS_DOMAIN_TRIANGLE,
Quad = NAK_TS_DOMAIN_QUAD,
}
#[repr(u8)]
#[derive(Clone, Copy, Debug)]
pub enum TessellationSpacing {
Integer = NAK_TS_SPACING_INTEGER,
FractionalOdd = NAK_TS_SPACING_FRACT_ODD,
FractionalEven = NAK_TS_SPACING_FRACT_EVEN,
}
#[repr(u8)]
#[derive(Clone, Copy, Debug)]
pub enum TessellationPrimitives {
Points = NAK_TS_PRIMS_POINTS,
Lines = NAK_TS_PRIMS_LINES,
TrianglesCW = NAK_TS_PRIMS_TRIANGLES_CW,
TrianglesCCW = NAK_TS_PRIMS_TRIANGLES_CCW,
}
#[derive(Debug)]
pub struct TessellationShaderInfo {
pub domain: TessellationDomain,
pub spacing: TessellationSpacing,
pub primitives: TessellationPrimitives,
}
#[derive(Debug)]
pub enum ShaderStageInfo {
Compute(ComputeShaderInfo),
Vertex,
Fragment(FragmentShaderInfo),
Geometry(GeometryShaderInfo),
TessellationInit(TessellationInitShaderInfo),
Tessellation(TessellationShaderInfo),
}
#[derive(Debug, Default)]
pub struct SysValInfo {
pub ab: u32,
pub c: u16,
}
#[derive(Debug)]
pub struct VtgIoInfo {
pub sysvals_in: SysValInfo,
pub sysvals_in_d: u8,
pub sysvals_out: SysValInfo,
pub sysvals_out_d: u8,
pub attr_in: [u32; 4],
pub attr_out: [u32; 4],
pub store_req_start: u8,
pub store_req_end: u8,
pub clip_enable: u8,
pub cull_enable: u8,
pub xfb: Option<Box<nak_xfb_info>>,
}
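// The attribute address map assumed by mark_attrs() and attr_written()
// below, as encoded in their range checks:
//
//   0x000..0x080  system values a/b (one bit per dword in sysvals.ab)
//   0x080..0x280  generic attributes (one bit per dword in attr_in/attr_out)
//   0x280..0x2c0  fixed-function color I/O (unsupported)
//   0x2c0..0x300  system values c (one bit per dword in sysvals.c)
//   0x3a0..0x3c0  system values d (one bit per dword in sysvals_in_d/out_d)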
impl VtgIoInfo {
fn mark_attrs(&mut self, addrs: Range<u16>, written: bool) {
let sysvals = if written {
&mut self.sysvals_out
} else {
&mut self.sysvals_in
};
let sysvals_d = if written {
&mut self.sysvals_out_d
} else {
&mut self.sysvals_in_d
};
let mut attr = BitMutView::new(if written {
&mut self.attr_out
} else {
&mut self.attr_in
});
let mut addrs = addrs;
addrs.start &= !3;
for addr in addrs.step_by(4) {
if addr < 0x080 {
sysvals.ab |= 1 << (addr / 4);
} else if addr < 0x280 {
let attr_idx = (addr - 0x080) as usize / 4;
attr.set_bit(attr_idx, true);
} else if addr < 0x2c0 {
panic!("FF color I/O not supported");
} else if addr < 0x300 {
sysvals.c |= 1 << ((addr - 0x2c0) / 4);
} else if addr >= 0x3a0 && addr < 0x3c0 {
*sysvals_d |= 1 << ((addr - 0x3a0) / 4);
}
}
}
pub fn mark_attrs_read(&mut self, addrs: Range<u16>) {
self.mark_attrs(addrs, false);
}
pub fn mark_attrs_written(&mut self, addrs: Range<u16>) {
self.mark_attrs(addrs, true);
}
pub fn attr_written(&self, addr: u16) -> bool {
if addr < 0x080 {
self.sysvals_out.ab & (1 << (addr / 4)) != 0
} else if addr < 0x280 {
let attr_idx = (addr - 0x080) as usize / 4;
BitView::new(&self.attr_out).get_bit(attr_idx)
} else if addr < 0x2c0 {
panic!("FF color I/O not supported");
} else if addr < 0x300 {
self.sysvals_out.c & (1 << ((addr - 0x2c0) / 4)) != 0
} else if addr >= 0x3a0 && addr < 0x3c0 {
self.sysvals_out_d & (1 << ((addr - 0x3a0) / 4)) != 0
} else {
panic!("Unknown I/O address");
}
}
pub fn mark_store_req(&mut self, addrs: Range<u16>) {
let start = (addrs.start / 4).try_into().unwrap();
let end = ((addrs.end - 1) / 4).try_into().unwrap();
self.store_req_start = min(self.store_req_start, start);
self.store_req_end = max(self.store_req_end, end);
}
}
#[derive(Debug)]
pub struct FragmentIoInfo {
pub sysvals_in: SysValInfo,
pub sysvals_in_d: [PixelImap; 8],
pub attr_in: [PixelImap; 128],
pub barycentric_attr_in: [u32; 4],
pub reads_sample_mask: bool,
pub writes_color: u32,
pub writes_sample_mask: bool,
pub writes_depth: bool,
}
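// mark_attr_read() follows the same attribute address map as
// VtgIoInfo::mark_attrs() above, except that generic attributes record a
// PixelImap interpolation mode rather than a single bit.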
impl FragmentIoInfo {
pub fn mark_attr_read(&mut self, addr: u16, interp: PixelImap) {
if addr < 0x080 {
self.sysvals_in.ab |= 1 << (addr / 4);
} else if addr < 0x280 {
let attr_idx = (addr - 0x080) as usize / 4;
self.attr_in[attr_idx] = interp;
} else if addr < 0x2c0 {
panic!("FF color I/O not supported");
} else if addr < 0x300 {
self.sysvals_in.c |= 1 << ((addr - 0x2c0) / 4);
} else if addr >= 0x3a0 && addr < 0x3c0 {
let attr_idx = (addr - 0x3a0) as usize / 4;
self.sysvals_in_d[attr_idx] = interp;
}
}
pub fn mark_barycentric_attr_in(&mut self, addr: u16) {
assert!(addr >= 0x80 && addr < 0x280);
let mut attr = BitMutView::new(&mut self.barycentric_attr_in);
let attr_idx = (addr - 0x080) as usize / 4;
attr.set_bit(attr_idx, true);
}
}
#[derive(Debug)]
pub enum ShaderIoInfo {
None,
Vtg(VtgIoInfo),
Fragment(FragmentIoInfo),
}
#[derive(Debug)]
pub struct ShaderInfo {
pub max_warps_per_sm: u32,
pub num_gprs: u8,
pub num_control_barriers: u8,
pub num_instrs: u32,
pub num_static_cycles: u64,
pub num_spills_to_mem: u32,
pub num_fills_from_mem: u32,
pub num_spills_to_reg: u32,
pub num_fills_from_reg: u32,
pub slm_size: u32,
pub max_crs_depth: u32,
pub uses_global_mem: bool,
pub writes_global_mem: bool,
pub uses_fp64: bool,
pub stage: ShaderStageInfo,
pub io: ShaderIoInfo,
}
pub trait ShaderModel {
fn sm(&self) -> u8;
#[allow(dead_code)]
fn is_fermi(&self) -> bool {
self.sm() >= 20 && self.sm() < 30
}
#[allow(dead_code)]
fn is_kepler_a(&self) -> bool {
self.sm() >= 30 && self.sm() < 32
}
#[allow(dead_code)]
fn is_kepler_b(&self) -> bool {
// TK1 is SM 3.2 and desktop Kepler B is SM 3.3+
self.sm() >= 32 && self.sm() < 40
}
#[allow(dead_code)]
fn is_kepler(&self) -> bool {
self.is_kepler_a() || self.is_kepler_b()
}
// The following helpers are pulled from GetSpaVersion in the open-source
// NVIDIA kernel driver sources
#[allow(dead_code)]
fn is_maxwell(&self) -> bool {
self.sm() >= 50 && self.sm() < 60
}
#[allow(dead_code)]
fn is_pascal(&self) -> bool {
self.sm() >= 60 && self.sm() < 70
}
#[allow(dead_code)]
fn is_volta(&self) -> bool {
self.sm() >= 70 && self.sm() < 73
}
#[allow(dead_code)]
fn is_turing(&self) -> bool {
self.sm() >= 73 && self.sm() < 80
}
#[allow(dead_code)]
fn is_ampere(&self) -> bool {
self.sm() >= 80 && self.sm() < 89
}
#[allow(dead_code)]
fn is_ada(&self) -> bool {
self.sm() == 89
}
#[allow(dead_code)]
fn is_hopper(&self) -> bool {
self.sm() >= 90 && self.sm() < 100
}
#[allow(dead_code)]
fn is_blackwell_a(&self) -> bool {
self.sm() >= 100 && self.sm() < 110
}
#[allow(dead_code)]
fn is_blackwell_b(&self) -> bool {
self.sm() >= 120 && self.sm() < 130
}
#[allow(dead_code)]
fn is_blackwell(&self) -> bool {
self.is_blackwell_a() || self.is_blackwell_b()
}
fn num_regs(&self, file: RegFile) -> u32;
fn hw_reserved_gprs(&self) -> u32;
fn crs_size(&self, max_crs_depth: u32) -> u32;
fn op_can_be_uniform(&self, op: &Op) -> bool;
// Scheduling information
fn op_needs_scoreboard(&self, op: &Op) -> bool {
!op.no_scoreboard() && !op.has_fixed_latency(self.sm())
}
/// Latency before another non-NOP can execute
fn exec_latency(&self, op: &Op) -> u32;
/// Read-after-write latency
fn raw_latency(
&self,
write: &Op,
dst_idx: usize,
read: &Op,
src_idx: usize,
) -> u32;
/// Write-after-read latency
fn war_latency(
&self,
read: &Op,
src_idx: usize,
write: &Op,
dst_idx: usize,
) -> u32;
/// Write-after-write latency
fn waw_latency(
&self,
a: &Op,
a_dst_idx: usize,
a_has_pred: bool,
b: &Op,
b_dst_idx: usize,
) -> u32;
/// Predicate read-after-write latency
fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32;
/// Worst-case access-after-write latency
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32;
/// Maximum encodable instruction delay
fn max_instr_delay(&self) -> u8;
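// An illustrative sketch (not the actual scheduler) of how these queries
// compose: the delay a consumer must wait after a fixed-latency producer
// is the RAW latency of the (dst_idx, src_idx) pair, clamped to what the
// encoding supports.  `sm`, `prod`, `cons`, `dst_idx`, and `src_idx` are
// hypothetical names:
//
//     let delay = sm
//         .raw_latency(&prod.op, dst_idx, &cons.op, src_idx)
//         .min(sm.max_instr_delay().into());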
fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op);
fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32>;
}
/// For compute shaders, large values of local_size impose an additional limit
/// on the number of GPRs per thread
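///
/// Worked example of the arithmetic below: local_size = [32, 32, 1] is
/// 1024 threads, already a multiple of 4 warps * 32 threads = 128, so
/// 65536 / 1024 = 64 GPRs, which is already a multiple of 8 and below the
/// 255 cap, giving a limit of 64.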
pub fn gpr_limit_from_local_size(local_size: &[u16; 3]) -> u32 {
fn prev_multiple_of(x: u32, y: u32) -> u32 {
(x / y) * y
}
let local_size = local_size[0] * local_size[1] * local_size[2];
// Warps are allocated in multiples of 4, at 32 threads per warp, so round
// the thread count up to a multiple of 128
let local_size = local_size.next_multiple_of(4 * 32) as u32;
let total_regs: u32 = 65536;
let out = total_regs / local_size;
// GPRs are allocated in multiples of 8
let out = prev_multiple_of(out, 8);
min(out, 255)
}
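/// Returns the maximum number of warps per SM for a shader using the given
/// number of GPRs.
///
/// Worked example of the arithmetic below: gprs = 40 is already a multiple
/// of 8; (65536 / 32) / 40 = 51, rounded down to a multiple of 4 gives 48,
/// which is also the hardware maximum.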
pub fn max_warps_per_sm(gprs: u32) -> u32 {
fn prev_multiple_of(x: u32, y: u32) -> u32 {
(x / y) * y
}
// TODO: Take local_size and shared mem limit into account for compute
let total_regs: u32 = 65536;
// GPRs are allocated in multiples of 8
let gprs = gprs.next_multiple_of(8);
let max_warps = prev_multiple_of((total_regs / 32) / gprs, 4);
min(max_warps, 48)
}
pub struct Shader<'a> {
pub sm: &'a dyn ShaderModel,
pub info: ShaderInfo,
pub functions: Vec<Function>,
}
impl Shader<'_> {
pub fn for_each_instr(&self, f: &mut impl FnMut(&Instr)) {
for func in &self.functions {
for b in &func.blocks {
for i in &b.instrs {
f(i);
}
}
}
}
pub fn map_instrs(
&mut self,
mut map: impl FnMut(Instr, &mut SSAValueAllocator) -> MappedInstrs,
) {
for f in &mut self.functions {
f.map_instrs(&mut map);
}
}
/// Remove all annotations, presumably before encoding the shader.
pub fn remove_annotations(&mut self) {
self.map_instrs(|instr: Instr, _| -> MappedInstrs {
if matches!(instr.op, Op::Annotate(_)) {
MappedInstrs::None
} else {
MappedInstrs::One(instr)
}
})
}
pub fn gather_info(&mut self) {
let mut num_instrs = 0;
let mut uses_global_mem = false;
let mut writes_global_mem = false;
let mut uses_fp64 = false;
self.for_each_instr(&mut |instr| {
num_instrs += 1;
if !uses_global_mem {
uses_global_mem = instr.uses_global_mem();
}
if !writes_global_mem {
writes_global_mem = instr.writes_global_mem();
}
if !uses_fp64 {
uses_fp64 = instr.op.is_fp64();
}
});
self.info.num_instrs = num_instrs;
self.info.uses_global_mem = uses_global_mem;
self.info.writes_global_mem = writes_global_mem;
self.info.uses_fp64 = uses_fp64;
self.info.max_warps_per_sm = max_warps_per_sm(
self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(),
);
}
}
impl fmt::Display for Shader<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
for func in &self.functions {
write!(f, "{}", func)?;
}
Ok(())
}
}