| // Copyright © 2022 Collabora, Ltd. |
| // SPDX-License-Identifier: MIT |
| |
| extern crate bitview; |
| extern crate nak_ir_proc; |
| |
| use bitview::{BitMutView, BitMutViewable, BitView, BitViewable, SetField}; |
| use nak_bindings::*; |
| |
| pub use crate::builder::{Builder, InstrBuilder, SSABuilder, SSAInstrBuilder}; |
| use crate::legalize::LegalizeBuilder; |
| use crate::sph::{OutputTopology, PixelImap}; |
| pub use crate::ssa_value::*; |
| use compiler::as_slice::*; |
| use compiler::cfg::CFG; |
| use compiler::smallvec::SmallVec; |
| use nak_ir_proc::*; |
| use std::cmp::{max, min}; |
| use std::fmt; |
| use std::fmt::Write; |
| use std::iter::Zip; |
| use std::ops::{BitAnd, BitOr, Deref, DerefMut, Index, IndexMut, Not, Range}; |
| use std::slice; |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub struct Label { |
| idx: u32, |
| } |
| |
| impl fmt::Display for Label { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "L{}", self.idx) |
| } |
| } |
| |
| pub struct LabelAllocator { |
| count: u32, |
| } |
| |
| impl LabelAllocator { |
| pub fn new() -> LabelAllocator { |
| LabelAllocator { count: 0 } |
| } |
| |
| pub fn alloc(&mut self) -> Label { |
| let idx = self.count; |
| self.count += 1; |
| Label { idx } |
| } |
| } |
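| |
| // A tiny usage sketch for the allocator above: labels are handed out |
| // sequentially and print as "L<n>". Module and test names are arbitrary. |
| #[cfg(test)] |
| mod label_sketch { |
| use super::*; |
| |
| #[test] |
| fn alloc_and_print() { |
| let mut alloc = LabelAllocator::new(); |
| assert_eq!(format!("{}", alloc.alloc()), "L0"); |
| assert_eq!(format!("{}", alloc.alloc()), "L1"); |
| } |
| } |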
| |
| /// Represents a register file |
| #[repr(u8)] |
| #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |
| pub enum RegFile { |
| /// The general-purpose register file |
| /// |
| /// General-purpose registers are 32 bits per SIMT channel. |
| GPR = 0, |
| |
| /// The general-purpose uniform register file |
| /// |
| /// General-purpose uniform registers are 32 bits each and uniform across a |
| /// wave. |
| UGPR = 1, |
| |
| /// The predicate register file |
| /// |
| /// Predicate registers are 1 bit per SIMT channel. |
| Pred = 2, |
| |
| /// The uniform predicate register file |
| /// |
| /// Uniform predicate registers are 1 bit and uniform across a wave. |
| UPred = 3, |
| |
| /// The carry flag register file |
| /// |
| /// Only one carry flag register exists in hardware, but representing it as |
| /// a reg file simplifies dependency tracking. |
| /// |
| /// This is used only on SM50. |
| Carry = 4, |
| |
| /// The barrier register file |
| /// |
| /// This is a lane mask used for wave re-convergence instructions. |
| Bar = 5, |
| |
| /// The memory register file |
| /// |
| /// This is a virtual register file for things which will get spilled to |
| /// local memory. Each memory location is 32 bits per SIMT channel. |
| Mem = 6, |
| } |
| |
| const NUM_REG_FILES: usize = 7; |
| |
| impl RegFile { |
| /// Returns true if the register file is uniform across a wave. |
| pub fn is_uniform(&self) -> bool { |
| match self { |
| RegFile::GPR |
| | RegFile::Pred |
| | RegFile::Carry |
| | RegFile::Bar |
| | RegFile::Mem => false, |
| RegFile::UGPR | RegFile::UPred => true, |
| } |
| } |
| |
| /// Returns the uniform form of this register file, if any. For `GPR` and |
| /// `UGPR`, this returns `UGPR` and for `Pred` and `UPred`, this returns |
| /// `UPred`. |
| pub fn to_uniform(self) -> Option<RegFile> { |
| match self { |
| RegFile::GPR | RegFile::UGPR => Some(RegFile::UGPR), |
| RegFile::Pred | RegFile::UPred => Some(RegFile::UPred), |
| RegFile::Carry | RegFile::Bar | RegFile::Mem => None, |
| } |
| } |
| |
| /// Returns the warp-wide version of this register file. |
| pub fn to_warp(self) -> RegFile { |
| match self { |
| RegFile::GPR | RegFile::UGPR => RegFile::GPR, |
| RegFile::Pred | RegFile::UPred => RegFile::Pred, |
| RegFile::Carry | RegFile::Bar | RegFile::Mem => self, |
| } |
| } |
| |
| /// Returns true if the register file is GPR or UGPR. |
| pub fn is_gpr(&self) -> bool { |
| match self { |
| RegFile::GPR | RegFile::UGPR => true, |
| RegFile::Pred |
| | RegFile::UPred |
| | RegFile::Carry |
| | RegFile::Bar |
| | RegFile::Mem => false, |
| } |
| } |
| |
| /// Returns true if the register file is a predicate register file. |
| pub fn is_predicate(&self) -> bool { |
| match self { |
| RegFile::GPR |
| | RegFile::UGPR |
| | RegFile::Carry |
| | RegFile::Bar |
| | RegFile::Mem => false, |
| RegFile::Pred | RegFile::UPred => true, |
| } |
| } |
| |
| pub fn fmt_prefix(&self) -> &'static str { |
| match self { |
| RegFile::GPR => "r", |
| RegFile::UGPR => "ur", |
| RegFile::Pred => "p", |
| RegFile::UPred => "up", |
| RegFile::Carry => "c", |
| RegFile::Bar => "b", |
| RegFile::Mem => "m", |
| } |
| } |
| } |
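| |
| // A minimal sketch of the conversions above, exercising only the API |
| // defined in this file. |
| #[cfg(test)] |
| mod reg_file_sketch { |
| use super::*; |
| |
| #[test] |
| fn uniform_and_warp_forms() { |
| // GPR and UGPR share UGPR as their uniform form. |
| assert_eq!(RegFile::GPR.to_uniform(), Some(RegFile::UGPR)); |
| assert_eq!(RegFile::UGPR.to_warp(), RegFile::GPR); |
| // Carry, Bar, and Mem have no uniform counterpart. |
| assert_eq!(RegFile::Carry.to_uniform(), None); |
| // UPred is both uniform and a predicate file. |
| assert!(RegFile::UPred.is_uniform() && RegFile::UPred.is_predicate()); |
| } |
| } |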
| |
| impl fmt::Display for RegFile { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| RegFile::GPR => write!(f, "GPR"), |
| RegFile::UGPR => write!(f, "UGPR"), |
| RegFile::Pred => write!(f, "Pred"), |
| RegFile::UPred => write!(f, "UPred"), |
| RegFile::Carry => write!(f, "Carry"), |
| RegFile::Bar => write!(f, "Bar"), |
| RegFile::Mem => write!(f, "Mem"), |
| } |
| } |
| } |
| |
| impl From<RegFile> for u8 { |
| fn from(value: RegFile) -> u8 { |
| value as u8 |
| } |
| } |
| |
| impl TryFrom<u32> for RegFile { |
| type Error = &'static str; |
| |
| fn try_from(value: u32) -> Result<Self, Self::Error> { |
| match value { |
| 0 => Ok(RegFile::GPR), |
| 1 => Ok(RegFile::UGPR), |
| 2 => Ok(RegFile::Pred), |
| 3 => Ok(RegFile::UPred), |
| 4 => Ok(RegFile::Carry), |
| 5 => Ok(RegFile::Bar), |
| 6 => Ok(RegFile::Mem), |
| _ => Err("Invalid register file number"), |
| } |
| } |
| } |
| |
| impl TryFrom<u16> for RegFile { |
| type Error = &'static str; |
| |
| fn try_from(value: u16) -> Result<Self, Self::Error> { |
| RegFile::try_from(u32::from(value)) |
| } |
| } |
| |
| impl TryFrom<u8> for RegFile { |
| type Error = &'static str; |
| |
| fn try_from(value: u8) -> Result<Self, Self::Error> { |
| RegFile::try_from(u32::from(value)) |
| } |
| } |
| |
| /// A trait for things which have an associated register file |
| pub trait HasRegFile { |
| fn file(&self) -> RegFile; |
| |
| fn is_uniform(&self) -> bool { |
| self.file().is_uniform() |
| } |
| |
| fn is_gpr(&self) -> bool { |
| self.file().is_gpr() |
| } |
| |
| fn is_predicate(&self) -> bool { |
| self.file().is_predicate() |
| } |
| } |
| |
| impl HasRegFile for &[SSAValue] { |
| fn file(&self) -> RegFile { |
| let comps = self.len(); |
| let file = self[0].file(); |
| for i in 1..comps { |
| if self[i].file() != file { |
| panic!("Illegal mix of RegFiles") |
| } |
| } |
| file |
| } |
| } |
| |
| #[derive(Clone)] |
| pub struct RegFileSet { |
| bits: u8, |
| } |
| |
| impl RegFileSet { |
| pub fn new() -> RegFileSet { |
| RegFileSet { bits: 0 } |
| } |
| |
| pub fn len(&self) -> usize { |
| self.bits.count_ones() as usize |
| } |
| |
| pub fn contains(&self, file: RegFile) -> bool { |
| self.bits & (1 << (file as u8)) != 0 |
| } |
| |
| pub fn insert(&mut self, file: RegFile) -> bool { |
| let has_file = self.contains(file); |
| self.bits |= 1 << (file as u8); |
| !has_file |
| } |
| |
| pub fn is_empty(&self) -> bool { |
| self.bits == 0 |
| } |
| |
| #[allow(dead_code)] |
| pub fn iter(&self) -> RegFileSet { |
| self.clone() |
| } |
| |
| pub fn remove(&mut self, file: RegFile) -> bool { |
| let has_file = self.contains(file); |
| self.bits &= !(1 << (file as u8)); |
| has_file |
| } |
| } |
| |
| impl FromIterator<RegFile> for RegFileSet { |
| fn from_iter<T: IntoIterator<Item = RegFile>>(iter: T) -> Self { |
| let mut set = RegFileSet::new(); |
| for file in iter { |
| set.insert(file); |
| } |
| set |
| } |
| } |
| |
| impl Iterator for RegFileSet { |
| type Item = RegFile; |
| |
| fn next(&mut self) -> Option<RegFile> { |
| if self.is_empty() { |
| None |
| } else { |
| let file = self.bits.trailing_zeros().try_into().unwrap(); |
| self.remove(file); |
| Some(file) |
| } |
| } |
| |
| fn size_hint(&self) -> (usize, Option<usize>) { |
| let len = self.len(); |
| (len, Some(len)) |
| } |
| } |
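| |
| // A small sketch of RegFileSet's bitset semantics: insertion is |
| // idempotent and iteration yields files in ascending enum order. |
| #[cfg(test)] |
| mod reg_file_set_sketch { |
| use super::*; |
| |
| #[test] |
| fn insert_and_iterate() { |
| let mut set = RegFileSet::new(); |
| assert!(set.insert(RegFile::Pred)); |
| assert!(set.insert(RegFile::GPR)); |
| assert!(!set.insert(RegFile::GPR)); // Already present |
| assert_eq!(set.len(), 2); |
| // Iteration order follows bit position, i.e. the enum value. |
| let files: Vec<_> = set.iter().collect(); |
| assert_eq!(files, [RegFile::GPR, RegFile::Pred]); |
| } |
| } |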
| |
| /// A container mapping register files to items. |
| /// |
| /// This is used by several passes which need to replicate a data structure |
| /// per-register-file. |
| #[derive(Clone, Copy)] |
| pub struct PerRegFile<T> { |
| per_file: [T; NUM_REG_FILES], |
| } |
| |
| impl<T> PerRegFile<T> { |
| /// Creates a new per-register-file container. |
| /// |
| /// Because this container assumes it always has an item for each register |
| /// file, it takes a callback which maps register files to initial values |
| /// to avoid adding a bunch of `Option<T>` or requiring `T` to implement |
| /// `Default`. If `T` does implement `Default`, then so does |
| /// `PerRegFile<T>`. |
| pub fn new_with<F: Fn(RegFile) -> T>(f: F) -> Self { |
| PerRegFile { |
| per_file: [ |
| f(RegFile::GPR), |
| f(RegFile::UGPR), |
| f(RegFile::Pred), |
| f(RegFile::UPred), |
| f(RegFile::Carry), |
| f(RegFile::Bar), |
| f(RegFile::Mem), |
| ], |
| } |
| } |
| |
| /// Iterates over the values in this container. |
| pub fn values(&self) -> slice::Iter<'_, T> { |
| self.per_file.iter() |
| } |
| |
| /// Iterates over the mutable values in this container. |
| pub fn values_mut(&mut self) -> slice::IterMut<'_, T> { |
| self.per_file.iter_mut() |
| } |
| } |
| |
| impl<T: Default> Default for PerRegFile<T> { |
| fn default() -> Self { |
| PerRegFile { |
| per_file: Default::default(), |
| } |
| } |
| } |
| |
| impl<T> Index<RegFile> for PerRegFile<T> { |
| type Output = T; |
| |
| fn index(&self, idx: RegFile) -> &T { |
| &self.per_file[idx as u8 as usize] |
| } |
| } |
| |
| impl<T> IndexMut<RegFile> for PerRegFile<T> { |
| fn index_mut(&mut self, idx: RegFile) -> &mut T { |
| &mut self.per_file[idx as u8 as usize] |
| } |
| } |
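| |
| // A minimal usage sketch for PerRegFile: build a per-file counter with |
| // new_with() and index it directly by RegFile. |
| #[cfg(test)] |
| mod per_reg_file_sketch { |
| use super::*; |
| |
| #[test] |
| fn count_per_file() { |
| let mut counts = PerRegFile::new_with(|_| 0_u32); |
| counts[RegFile::GPR] += 2; |
| counts[RegFile::UGPR] += 1; |
| assert_eq!(counts[RegFile::GPR], 2); |
| // values() visits all NUM_REG_FILES entries. |
| assert_eq!(counts.values().sum::<u32>(), 3); |
| } |
| } |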
| |
| /// A reference to a contiguous range of registers in a particular register |
| /// file. |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub struct RegRef { |
| packed: u32, |
| } |
| |
| impl RegRef { |
| pub const MAX_IDX: u32 = (1 << 26) - 1; |
| |
| /// Creates a new register reference. |
| /// |
| /// # Panics |
| /// |
| /// This method panics if `base_idx > RegRef::MAX_IDX` or if `comps` is |
| /// not in the range `1..=8`. |
| pub fn new(file: RegFile, base_idx: u32, comps: u8) -> RegRef { |
| assert!(base_idx <= Self::MAX_IDX); |
| let mut packed = base_idx; |
| assert!(comps > 0 && comps <= 8); |
| packed |= u32::from(comps - 1) << 26; |
| assert!(u8::from(file) < 8); |
| packed |= u32::from(u8::from(file)) << 29; |
| RegRef { packed } |
| } |
| |
| /// Returns the index of the first register referenced. |
| pub fn base_idx(&self) -> u32 { |
| self.packed & 0x03ffffff |
| } |
| |
| /// Returns the range of register indices referenced. |
| pub fn idx_range(&self) -> Range<u32> { |
| let start = self.base_idx(); |
| let end = start + u32::from(self.comps()); |
| start..end |
| } |
| |
| /// Returns the number of registers referenced. |
| pub fn comps(&self) -> u8 { |
| (((self.packed >> 26) & 0x7) + 1).try_into().unwrap() |
| } |
| |
| /// Returns a reference to the single register at `base_idx() + c`. |
| pub fn comp(&self, c: u8) -> RegRef { |
| assert!(c < self.comps()); |
| RegRef::new(self.file(), self.base_idx() + u32::from(c), 1) |
| } |
| } |
| |
| impl HasRegFile for RegRef { |
| fn file(&self) -> RegFile { |
| ((self.packed >> 29) & 0x7).try_into().unwrap() |
| } |
| } |
| |
| impl fmt::Display for RegRef { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "{}{}", self.file().fmt_prefix(), self.base_idx())?; |
| if self.comps() > 1 { |
| write!(f, "..{}", self.idx_range().end)?; |
| } |
| Ok(()) |
| } |
| } |
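| |
| // A sketch of the packed RegRef encoding: bits [0, 26) hold the base |
| // index, bits [26, 29) hold comps - 1, and bits [29, 32) hold the file. |
| #[cfg(test)] |
| mod reg_ref_sketch { |
| use super::*; |
| |
| #[test] |
| fn pack_and_unpack() { |
| let pair = RegRef::new(RegFile::GPR, 4, 2); |
| assert_eq!(pair.base_idx(), 4); |
| assert_eq!(pair.comps(), 2); |
| assert_eq!(pair.file(), RegFile::GPR); |
| assert_eq!(pair.idx_range(), 4..6); |
| assert_eq!(format!("{}", pair), "r4..6"); |
| // comp() selects a single register out of the vector. |
| assert!(pair.comp(1) == RegRef::new(RegFile::GPR, 5, 1)); |
| } |
| } |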
| |
| #[derive(Clone)] |
| pub enum Dst { |
| None, |
| SSA(SSARef), |
| Reg(RegRef), |
| } |
| |
| impl Dst { |
| pub fn is_none(&self) -> bool { |
| matches!(self, Dst::None) |
| } |
| |
| pub fn as_reg(&self) -> Option<&RegRef> { |
| match self { |
| Dst::Reg(r) => Some(r), |
| _ => None, |
| } |
| } |
| |
| pub fn as_ssa(&self) -> Option<&SSARef> { |
| match self { |
| Dst::SSA(r) => Some(r), |
| _ => None, |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn to_ssa(self) -> SSARef { |
| match self { |
| Dst::SSA(r) => r, |
| _ => panic!("Expected ssa"), |
| } |
| } |
| |
| pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> { |
| match self { |
| Dst::None | Dst::Reg(_) => &[], |
| Dst::SSA(ssa) => ssa.deref(), |
| } |
| .iter() |
| } |
| |
| pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> { |
| match self { |
| Dst::None | Dst::Reg(_) => &mut [], |
| Dst::SSA(ssa) => ssa.deref_mut(), |
| } |
| .iter_mut() |
| } |
| } |
| |
| impl From<RegRef> for Dst { |
| fn from(reg: RegRef) -> Dst { |
| Dst::Reg(reg) |
| } |
| } |
| |
| impl<T: Into<SSARef>> From<T> for Dst { |
| fn from(ssa: T) -> Dst { |
| Dst::SSA(ssa.into()) |
| } |
| } |
| |
| impl From<Option<SSAValue>> for Dst { |
| fn from(ssa: Option<SSAValue>) -> Dst { |
| match ssa { |
| Some(ssa) => Dst::SSA(ssa.into()), |
| None => Dst::None, |
| } |
| } |
| } |
| |
| impl fmt::Display for Dst { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| Dst::None => write!(f, "null")?, |
| Dst::SSA(v) => v.fmt(f)?, |
| Dst::Reg(r) => r.fmt(f)?, |
| } |
| Ok(()) |
| } |
| } |
| |
| #[derive(Clone, Eq, Hash, PartialEq)] |
| pub enum CBuf { |
| Binding(u8), |
| |
| #[allow(dead_code)] |
| BindlessSSA([SSAValue; 2]), |
| |
| #[allow(dead_code)] |
| BindlessUGPR(RegRef), |
| } |
| |
| impl fmt::Display for CBuf { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| CBuf::Binding(idx) => write!(f, "c[{:#x}]", idx), |
| CBuf::BindlessSSA(v) => write!(f, "cx[{{{}, {}}}]", v[0], v[1]), |
| CBuf::BindlessUGPR(r) => write!(f, "cx[{}]", r), |
| } |
| } |
| } |
| |
| #[derive(Clone, Eq, Hash, PartialEq)] |
| pub struct CBufRef { |
| pub buf: CBuf, |
| pub offset: u16, |
| } |
| |
| impl CBufRef { |
| pub fn offset(self, offset: u16) -> CBufRef { |
| CBufRef { |
| buf: self.buf, |
| offset: self.offset + offset, |
| } |
| } |
| } |
| |
| impl fmt::Display for CBufRef { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "{}[{:#x}]", self.buf, self.offset) |
| } |
| } |
| |
| #[derive(Clone, Eq, Hash, PartialEq)] |
| pub enum SrcRef { |
| Zero, |
| True, |
| False, |
| Imm32(u32), |
| CBuf(CBufRef), |
| SSA(SSARef), |
| Reg(RegRef), |
| } |
| |
| impl SrcRef { |
| #[allow(dead_code)] |
| pub fn is_alu(&self) -> bool { |
| match self { |
| SrcRef::Zero | SrcRef::Imm32(_) | SrcRef::CBuf(_) => true, |
| SrcRef::SSA(ssa) => ssa.is_gpr(), |
| SrcRef::Reg(reg) => reg.is_gpr(), |
| SrcRef::True | SrcRef::False => false, |
| } |
| } |
| |
| pub fn is_bindless_cbuf(&self) -> bool { |
| match self { |
| SrcRef::CBuf(cbuf) => { |
| matches!(cbuf.buf, CBuf::BindlessSSA(_) | CBuf::BindlessUGPR(_)) |
| } |
| _ => false, |
| } |
| } |
| |
| pub fn is_predicate(&self) -> bool { |
| match self { |
| SrcRef::Zero | SrcRef::Imm32(_) | SrcRef::CBuf(_) => false, |
| SrcRef::True | SrcRef::False => true, |
| SrcRef::SSA(ssa) => ssa.is_predicate(), |
| SrcRef::Reg(reg) => reg.is_predicate(), |
| } |
| } |
| |
| pub fn is_carry(&self) -> bool { |
| match self { |
| SrcRef::SSA(ssa) => ssa.file() == RegFile::Carry, |
| SrcRef::Reg(reg) => reg.file() == RegFile::Carry, |
| _ => false, |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn is_barrier(&self) -> bool { |
| match self { |
| SrcRef::SSA(ssa) => ssa.file() == RegFile::Bar, |
| SrcRef::Reg(reg) => reg.file() == RegFile::Bar, |
| _ => false, |
| } |
| } |
| |
| pub fn as_reg(&self) -> Option<&RegRef> { |
| match self { |
| SrcRef::Reg(r) => Some(r), |
| _ => None, |
| } |
| } |
| |
| pub fn as_ssa(&self) -> Option<&SSARef> { |
| match self { |
| SrcRef::SSA(r) => Some(r), |
| _ => None, |
| } |
| } |
| |
| pub fn to_ssa(self) -> SSARef { |
| match self { |
| SrcRef::SSA(r) => r, |
| _ => panic!(), |
| } |
| } |
| |
| pub fn as_u32(&self) -> Option<u32> { |
| match self { |
| SrcRef::Zero => Some(0), |
| SrcRef::Imm32(u) => Some(*u), |
| SrcRef::CBuf(_) | SrcRef::SSA(_) | SrcRef::Reg(_) => None, |
| _ => panic!("Invalid integer source"), |
| } |
| } |
| |
| pub fn get_reg(&self) -> Option<&RegRef> { |
| match self { |
| SrcRef::Zero |
| | SrcRef::True |
| | SrcRef::False |
| | SrcRef::Imm32(_) |
| | SrcRef::SSA(_) => None, |
| SrcRef::CBuf(cb) => match &cb.buf { |
| CBuf::Binding(_) | CBuf::BindlessSSA(_) => None, |
| CBuf::BindlessUGPR(reg) => Some(reg), |
| }, |
| SrcRef::Reg(reg) => Some(reg), |
| } |
| } |
| |
| pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> { |
| match self { |
| SrcRef::Zero |
| | SrcRef::True |
| | SrcRef::False |
| | SrcRef::Imm32(_) |
| | SrcRef::Reg(_) => &[], |
| SrcRef::CBuf(cb) => match &cb.buf { |
| CBuf::Binding(_) | CBuf::BindlessUGPR(_) => &[], |
| CBuf::BindlessSSA(ssa) => &ssa[..], |
| }, |
| SrcRef::SSA(ssa) => ssa.deref(), |
| } |
| .iter() |
| } |
| |
| pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> { |
| match self { |
| SrcRef::Zero |
| | SrcRef::True |
| | SrcRef::False |
| | SrcRef::Imm32(_) |
| | SrcRef::Reg(_) => &mut [], |
| SrcRef::CBuf(cb) => match &mut cb.buf { |
| CBuf::Binding(_) | CBuf::BindlessUGPR(_) => &mut [], |
| CBuf::BindlessSSA(ssa) => &mut ssa[..], |
| }, |
| SrcRef::SSA(ssa) => ssa.deref_mut(), |
| } |
| .iter_mut() |
| } |
| } |
| |
| impl From<bool> for SrcRef { |
| fn from(b: bool) -> SrcRef { |
| if b { |
| SrcRef::True |
| } else { |
| SrcRef::False |
| } |
| } |
| } |
| |
| impl From<u32> for SrcRef { |
| fn from(u: u32) -> SrcRef { |
| if u == 0 { |
| SrcRef::Zero |
| } else { |
| SrcRef::Imm32(u) |
| } |
| } |
| } |
| |
| impl From<f32> for SrcRef { |
| fn from(f: f32) -> SrcRef { |
| f.to_bits().into() |
| } |
| } |
| |
| impl From<PrmtSel> for SrcRef { |
| fn from(sel: PrmtSel) -> SrcRef { |
| u32::from(sel.0).into() |
| } |
| } |
| |
| impl From<CBufRef> for SrcRef { |
| fn from(cb: CBufRef) -> SrcRef { |
| SrcRef::CBuf(cb) |
| } |
| } |
| |
| impl From<RegRef> for SrcRef { |
| fn from(reg: RegRef) -> SrcRef { |
| SrcRef::Reg(reg) |
| } |
| } |
| |
| impl<T: Into<SSARef>> From<T> for SrcRef { |
| fn from(ssa: T) -> SrcRef { |
| SrcRef::SSA(ssa.into()) |
| } |
| } |
| |
| impl From<PredRef> for SrcRef { |
| fn from(value: PredRef) -> Self { |
| match value { |
| PredRef::None => SrcRef::True, |
| PredRef::Reg(reg) => SrcRef::Reg(reg), |
| PredRef::SSA(ssa) => SrcRef::SSA(ssa.into()), |
| } |
| } |
| } |
| |
| impl fmt::Display for SrcRef { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| SrcRef::Zero => write!(f, "rZ"), |
| SrcRef::True => write!(f, "pT"), |
| SrcRef::False => write!(f, "pF"), |
| SrcRef::Imm32(u) => write!(f, "{:#x}", u), |
| SrcRef::CBuf(c) => c.fmt(f), |
| SrcRef::SSA(v) => v.fmt(f), |
| SrcRef::Reg(r) => r.fmt(f), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, PartialEq)] |
| pub enum SrcMod { |
| None, |
| FAbs, |
| FNeg, |
| FNegAbs, |
| INeg, |
| BNot, |
| } |
| |
| impl SrcMod { |
| pub fn is_none(&self) -> bool { |
| matches!(self, SrcMod::None) |
| } |
| |
| pub fn has_fabs(&self) -> bool { |
| match self { |
| SrcMod::None | SrcMod::FNeg => false, |
| SrcMod::FAbs | SrcMod::FNegAbs => true, |
| _ => panic!("Not a float modifier"), |
| } |
| } |
| |
| pub fn has_fneg(&self) -> bool { |
| match self { |
| SrcMod::None | SrcMod::FAbs => false, |
| SrcMod::FNeg | SrcMod::FNegAbs => true, |
| _ => panic!("Not a float modifier"), |
| } |
| } |
| |
| pub fn is_ineg(&self) -> bool { |
| match self { |
| SrcMod::None => false, |
| SrcMod::INeg => true, |
| _ => panic!("Not an integer modifier"), |
| } |
| } |
| |
| pub fn is_bnot(&self) -> bool { |
| match self { |
| SrcMod::None => false, |
| SrcMod::BNot => true, |
| _ => panic!("Not a bitwise modifier"), |
| } |
| } |
| |
| pub fn fabs(self) -> SrcMod { |
| match self { |
| SrcMod::None | SrcMod::FAbs | SrcMod::FNeg | SrcMod::FNegAbs => { |
| SrcMod::FAbs |
| } |
| _ => panic!("Not a float source modifier"), |
| } |
| } |
| |
| pub fn fneg(self) -> SrcMod { |
| match self { |
| SrcMod::None => SrcMod::FNeg, |
| SrcMod::FAbs => SrcMod::FNegAbs, |
| SrcMod::FNeg => SrcMod::None, |
| SrcMod::FNegAbs => SrcMod::FAbs, |
| _ => panic!("Not a float source modifier"), |
| } |
| } |
| |
| pub fn ineg(self) -> SrcMod { |
| match self { |
| SrcMod::None => SrcMod::INeg, |
| SrcMod::INeg => SrcMod::None, |
| _ => panic!("Not an integer source modifier"), |
| } |
| } |
| |
| pub fn bnot(self) -> SrcMod { |
| match self { |
| SrcMod::None => SrcMod::BNot, |
| SrcMod::BNot => SrcMod::None, |
| _ => panic!("Not a boolean source modifier"), |
| } |
| } |
| |
| pub fn modify(self, other: SrcMod) -> SrcMod { |
| match other { |
| SrcMod::None => self, |
| SrcMod::FAbs => self.fabs(), |
| SrcMod::FNeg => self.fneg(), |
| SrcMod::FNegAbs => self.fabs().fneg(), |
| SrcMod::INeg => self.ineg(), |
| SrcMod::BNot => self.bnot(), |
| } |
| } |
| } |
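| |
| // A sketch of modifier composition: modify() composes two modifiers, so |
| // double negation cancels and fabs() absorbs any pending negation. |
| #[cfg(test)] |
| mod src_mod_sketch { |
| use super::*; |
| |
| #[test] |
| fn compose_float_mods() { |
| // SrcMod does not derive Debug so compare with plain asserts. |
| assert!(SrcMod::None.fneg().fneg() == SrcMod::None); |
| assert!(SrcMod::FAbs.fneg() == SrcMod::FNegAbs); |
| // fabs() discards any pending negation. |
| assert!(SrcMod::FNegAbs.fabs() == SrcMod::FAbs); |
| assert!(SrcMod::FNeg.modify(SrcMod::FNegAbs) == SrcMod::FNegAbs); |
| } |
| } |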
| |
| #[derive(Clone, Copy, PartialEq)] |
| #[allow(dead_code)] |
| pub enum SrcSwizzle { |
| None, |
| Xx, |
| Yy, |
| } |
| |
| impl SrcSwizzle { |
| pub fn is_none(&self) -> bool { |
| matches!(self, SrcSwizzle::None) |
| } |
| } |
| |
| impl fmt::Display for SrcSwizzle { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| SrcSwizzle::None => Ok(()), |
| SrcSwizzle::Xx => write!(f, ".xx"), |
| SrcSwizzle::Yy => write!(f, ".yy"), |
| } |
| } |
| } |
| |
| #[derive(Clone, PartialEq)] |
| pub struct Src { |
| pub src_ref: SrcRef, |
| pub src_mod: SrcMod, |
| pub src_swizzle: SrcSwizzle, |
| } |
| |
| impl Src { |
| pub const ZERO: Src = Src { |
| src_ref: SrcRef::Zero, |
| src_mod: SrcMod::None, |
| src_swizzle: SrcSwizzle::None, |
| }; |
| |
| pub fn new_imm_u32(u: u32) -> Src { |
| u.into() |
| } |
| |
| pub fn new_imm_bool(b: bool) -> Src { |
| b.into() |
| } |
| |
| pub fn is_unmodified(&self) -> bool { |
| self.src_mod.is_none() && self.src_swizzle.is_none() |
| } |
| |
| pub fn fabs(self) -> Src { |
| Src { |
| src_ref: self.src_ref, |
| src_mod: self.src_mod.fabs(), |
| src_swizzle: self.src_swizzle, |
| } |
| } |
| |
| pub fn fneg(self) -> Src { |
| Src { |
| src_ref: self.src_ref, |
| src_mod: self.src_mod.fneg(), |
| src_swizzle: self.src_swizzle, |
| } |
| } |
| |
| pub fn ineg(self) -> Src { |
| Src { |
| src_ref: self.src_ref, |
| src_mod: self.src_mod.ineg(), |
| src_swizzle: self.src_swizzle, |
| } |
| } |
| |
| pub fn bnot(self) -> Src { |
| Src { |
| src_ref: self.src_ref, |
| src_mod: self.src_mod.bnot(), |
| src_swizzle: self.src_swizzle, |
| } |
| } |
| |
| pub fn modify(mut self, src_mod: SrcMod) -> Src { |
| self.src_mod = self.src_mod.modify(src_mod); |
| self |
| } |
| |
| pub fn as_u32(&self, src_type: SrcType) -> Option<u32> { |
| let u = match &self.src_ref { |
| SrcRef::Zero => 0, |
| SrcRef::Imm32(u) => *u, |
| _ => return None, |
| }; |
| |
| if self.is_unmodified() { |
| return Some(u); |
| } |
| |
| assert!(src_type == SrcType::F16v2 || self.src_swizzle.is_none()); |
| |
| // INeg affects more than just the 32 bits of input data so it can't be |
| // trivially folded. In fact, -imm may not be representable as a 32-bit |
| // immediate at all. |
| if src_type == SrcType::I32 { |
| return None; |
| } |
| |
| Some(match src_type { |
| SrcType::F16 => { |
| let low = u & 0xFFFF; |
| |
| match self.src_mod { |
| SrcMod::None => low, |
| SrcMod::FAbs => low & !(1_u32 << 15), |
| SrcMod::FNeg => low ^ (1_u32 << 15), |
| SrcMod::FNegAbs => low | (1_u32 << 15), |
| _ => panic!("Not a float source modifier"), |
| } |
| } |
| SrcType::F16v2 => { |
| let u = match self.src_swizzle { |
| SrcSwizzle::None => u, |
| SrcSwizzle::Xx => (u << 16) | (u & 0xffff), |
| SrcSwizzle::Yy => (u & 0xffff0000) | (u >> 16), |
| }; |
| |
| match self.src_mod { |
| SrcMod::None => u, |
| SrcMod::FAbs => u & 0x7FFF7FFF, |
| SrcMod::FNeg => u ^ 0x80008000, |
| SrcMod::FNegAbs => u | 0x80008000, |
| _ => panic!("Not a float source modifier"), |
| } |
| } |
| SrcType::F32 | SrcType::F64 => match self.src_mod { |
| SrcMod::None => u, |
| SrcMod::FAbs => u & !(1_u32 << 31), |
| SrcMod::FNeg => u ^ (1_u32 << 31), |
| SrcMod::FNegAbs => u | (1_u32 << 31), |
| _ => panic!("Not a float source modifier"), |
| }, |
| SrcType::I32 => match self.src_mod { |
| SrcMod::None => u, |
| SrcMod::INeg => -(u as i32) as u32, |
| _ => panic!("Not an integer source modifier"), |
| }, |
| SrcType::B32 => match self.src_mod { |
| SrcMod::None => u, |
| SrcMod::BNot => !u, |
| _ => panic!("Not a bitwise source modifier"), |
| }, |
| _ => { |
| assert!(self.is_unmodified()); |
| u |
| } |
| }) |
| } |
| |
| pub fn as_ssa(&self) -> Option<&SSARef> { |
| if self.is_unmodified() { |
| self.src_ref.as_ssa() |
| } else { |
| None |
| } |
| } |
| |
| pub fn to_ssa(self) -> SSARef { |
| if self.is_unmodified() { |
| self.src_ref.to_ssa() |
| } else { |
| panic!("Did not expect src_mod"); |
| } |
| } |
| |
| pub fn as_bool(&self) -> Option<bool> { |
| match &self.src_ref { |
| SrcRef::True => Some(!self.src_mod.is_bnot()), |
| SrcRef::False => Some(self.src_mod.is_bnot()), |
| SrcRef::SSA(vec) => { |
| assert!(vec.is_predicate() && vec.comps() == 1); |
| None |
| } |
| SrcRef::Reg(reg) => { |
| assert!(reg.is_predicate() && reg.comps() == 1); |
| None |
| } |
| _ => panic!("Not a boolean source"), |
| } |
| } |
| |
| pub fn as_imm_not_i20(&self) -> Option<u32> { |
| match self.src_ref { |
| SrcRef::Imm32(i) => { |
| assert!(self.is_unmodified()); |
| let top = i & 0xfff80000; |
| if top == 0 || top == 0xfff80000 { |
| None |
| } else { |
| Some(i) |
| } |
| } |
| _ => None, |
| } |
| } |
| |
| pub fn as_imm_not_f20(&self) -> Option<u32> { |
| match self.src_ref { |
| SrcRef::Imm32(i) => { |
| assert!(self.is_unmodified()); |
| if (i & 0xfff) == 0 { |
| None |
| } else { |
| Some(i) |
| } |
| } |
| _ => None, |
| } |
| } |
| |
| pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> { |
| self.src_ref.iter_ssa() |
| } |
| |
| pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> { |
| self.src_ref.iter_ssa_mut() |
| } |
| |
| pub fn is_uniform(&self) -> bool { |
| match &self.src_ref { |
| SrcRef::Zero |
| | SrcRef::True |
| | SrcRef::False |
| | SrcRef::Imm32(_) |
| | SrcRef::CBuf(_) => true, |
| SrcRef::SSA(ssa) => ssa.is_uniform(), |
| SrcRef::Reg(reg) => reg.is_uniform(), |
| } |
| } |
| |
| pub fn is_bindless_cbuf(&self) -> bool { |
| self.src_ref.is_bindless_cbuf() |
| } |
| |
| pub fn is_upred_reg(&self) -> bool { |
| match &self.src_ref { |
| SrcRef::SSA(ssa) => ssa.file() == RegFile::UPred, |
| SrcRef::Reg(reg) => reg.file() == RegFile::UPred, |
| _ => false, |
| } |
| } |
| |
| pub fn is_predicate(&self) -> bool { |
| self.src_ref.is_predicate() |
| } |
| |
| pub fn is_zero(&self) -> bool { |
| match self.src_ref { |
| SrcRef::Zero | SrcRef::Imm32(0) => match self.src_mod { |
| SrcMod::None | SrcMod::FAbs => true, |
| SrcMod::FNeg | SrcMod::FNegAbs | SrcMod::BNot => false, |
| // INeg affects more than just the 32 bits of input data so -0 |
| // may not be equivalent to 0. |
| SrcMod::INeg => false, |
| }, |
| _ => false, |
| } |
| } |
| |
| pub fn is_nonzero(&self) -> bool { |
| assert!(self.is_unmodified()); |
| matches!(self.src_ref, SrcRef::Imm32(x) if x != 0) |
| } |
| |
| pub fn is_true(&self) -> bool { |
| self.as_bool() == Some(true) |
| } |
| |
| pub fn is_fneg_zero(&self, src_type: SrcType) -> bool { |
| match self.as_u32(src_type) { |
| Some(0x00008000) => src_type == SrcType::F16, |
| Some(0x80000000) => { |
| src_type == SrcType::F32 || src_type == SrcType::F64 |
| } |
| Some(0x80008000) => src_type == SrcType::F16v2, |
| _ => false, |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn supports_type(&self, src_type: &SrcType) -> bool { |
| match src_type { |
| SrcType::SSA => { |
| if !self.is_unmodified() { |
| return false; |
| } |
| |
| matches!(self.src_ref, SrcRef::SSA(_) | SrcRef::Reg(_)) |
| } |
| SrcType::GPR => { |
| if !self.is_unmodified() { |
| return false; |
| } |
| |
| matches!( |
| self.src_ref, |
| SrcRef::Zero | SrcRef::SSA(_) | SrcRef::Reg(_) |
| ) |
| } |
| SrcType::ALU => self.is_unmodified() && self.src_ref.is_alu(), |
| SrcType::F16 | SrcType::F32 | SrcType::F64 | SrcType::F16v2 => { |
| match self.src_mod { |
| SrcMod::None |
| | SrcMod::FAbs |
| | SrcMod::FNeg |
| | SrcMod::FNegAbs => (), |
| _ => return false, |
| } |
| |
| self.src_ref.is_alu() |
| } |
| SrcType::I32 => { |
| match self.src_mod { |
| SrcMod::None | SrcMod::INeg => (), |
| _ => return false, |
| } |
| |
| self.src_ref.is_alu() |
| } |
| SrcType::B32 => { |
| match self.src_mod { |
| SrcMod::None | SrcMod::BNot => (), |
| _ => return false, |
| } |
| |
| self.src_ref.is_alu() |
| } |
| SrcType::Pred => { |
| match self.src_mod { |
| SrcMod::None | SrcMod::BNot => (), |
| _ => return false, |
| } |
| |
| self.src_ref.is_predicate() |
| } |
| SrcType::Carry => self.is_unmodified() && self.src_ref.is_carry(), |
| SrcType::Bar => self.is_unmodified() && self.src_ref.is_barrier(), |
| } |
| } |
| } |
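| |
| // A sketch of immediate folding through as_u32(): float modifiers fold by |
| // masking or flipping sign bits in the 32-bit payload, while INeg is never |
| // folded (see the comment in as_u32()). |
| #[cfg(test)] |
| mod src_fold_sketch { |
| use super::*; |
| |
| #[test] |
| fn fold_f32_mods() { |
| // 1.0f32 is 0x3f800000; fneg flips bit 31. |
| let src = Src::from(1.0_f32).fneg(); |
| assert_eq!(src.as_u32(SrcType::F32), Some(0xbf80_0000)); |
| // fabs of a negative immediate clears bit 31. |
| let src = Src::from(-1.0_f32).fabs(); |
| assert_eq!(src.as_u32(SrcType::F32), Some(0x3f80_0000)); |
| // INeg cannot be folded to a 32-bit immediate. |
| let src = Src::from(7_u32).ineg(); |
| assert_eq!(src.as_u32(SrcType::I32), None); |
| } |
| } |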
| |
| impl<T: Into<SrcRef>> From<T> for Src { |
| fn from(value: T) -> Src { |
| Src { |
| src_ref: value.into(), |
| src_mod: SrcMod::None, |
| src_swizzle: SrcSwizzle::None, |
| } |
| } |
| } |
| |
| impl From<Pred> for Src { |
| fn from(value: Pred) -> Self { |
| Src { |
| src_ref: value.pred_ref.into(), |
| src_mod: if value.pred_inv { |
| SrcMod::BNot |
| } else { |
| SrcMod::None |
| }, |
| src_swizzle: SrcSwizzle::None, |
| } |
| } |
| } |
| |
| impl fmt::Display for Src { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self.src_mod { |
| SrcMod::None => write!(f, "{}{}", self.src_ref, self.src_swizzle), |
| SrcMod::FAbs => write!(f, "|{}{}|", self.src_ref, self.src_swizzle), |
| SrcMod::FNeg => write!(f, "-{}{}", self.src_ref, self.src_swizzle), |
| SrcMod::FNegAbs => { |
| write!(f, "-|{}{}|", self.src_ref, self.src_swizzle) |
| } |
| SrcMod::INeg => write!(f, "-{}{}", self.src_ref, self.src_swizzle), |
| SrcMod::BNot => write!(f, "!{}{}", self.src_ref, self.src_swizzle), |
| } |
| } |
| } |
| |
| #[repr(u8)] |
| #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |
| pub enum SrcType { |
| SSA, |
| GPR, |
| ALU, |
| F16, |
| F16v2, |
| F32, |
| F64, |
| I32, |
| B32, |
| Pred, |
| Carry, |
| Bar, |
| } |
| |
| impl SrcType { |
| const DEFAULT: SrcType = SrcType::GPR; |
| } |
| |
| pub type SrcTypeList = AttrList<SrcType>; |
| |
| pub trait SrcsAsSlice: AsSlice<Src, Attr = SrcType> { |
| fn srcs_as_slice(&self) -> &[Src] { |
| self.as_slice() |
| } |
| |
| fn srcs_as_mut_slice(&mut self) -> &mut [Src] { |
| self.as_mut_slice() |
| } |
| |
| fn src_types(&self) -> SrcTypeList { |
| self.attrs() |
| } |
| |
| fn src_idx(&self, src: &Src) -> usize { |
| let r = self.srcs_as_slice().as_ptr_range(); |
| assert!(r.contains(&(src as *const Src))); |
| unsafe { (src as *const Src).offset_from(r.start) as usize } |
| } |
| } |
| |
| impl<T: AsSlice<Src, Attr = SrcType>> SrcsAsSlice for T {} |
| |
| fn all_dsts_uniform(dsts: &[Dst]) -> bool { |
| let mut uniform = None; |
| for dst in dsts { |
| let dst_uniform = match dst { |
| Dst::None => continue, |
| Dst::Reg(r) => r.is_uniform(), |
| Dst::SSA(r) => r.file().is_uniform(), |
| }; |
| assert!(uniform.is_none() || uniform == Some(dst_uniform)); |
| uniform = Some(dst_uniform); |
| } |
| uniform == Some(true) |
| } |
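| |
| // A sketch of all_dsts_uniform(): None destinations are skipped and the |
| // remaining destinations must agree on uniformity. |
| #[cfg(test)] |
| mod dst_uniform_sketch { |
| use super::*; |
| |
| #[test] |
| fn uniform_dsts() { |
| let ur = Dst::Reg(RegRef::new(RegFile::UGPR, 0, 1)); |
| let r = Dst::Reg(RegRef::new(RegFile::GPR, 0, 1)); |
| assert!(all_dsts_uniform(&[Dst::None, ur])); |
| assert!(!all_dsts_uniform(&[r])); |
| // With no non-None dsts, there is nothing uniform to report. |
| assert!(!all_dsts_uniform(&[Dst::None])); |
| } |
| } |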
| |
| #[repr(u8)] |
| #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |
| pub enum DstType { |
| Pred, |
| GPR, |
| F16, |
| F16v2, |
| F32, |
| F64, |
| Carry, |
| Bar, |
| Vec, |
| } |
| |
| impl DstType { |
| const DEFAULT: DstType = DstType::Vec; |
| } |
| |
| pub type DstTypeList = AttrList<DstType>; |
| |
| pub trait DstsAsSlice: AsSlice<Dst, Attr = DstType> { |
| fn dsts_as_slice(&self) -> &[Dst] { |
| self.as_slice() |
| } |
| |
| fn dsts_as_mut_slice(&mut self) -> &mut [Dst] { |
| self.as_mut_slice() |
| } |
| |
| // Currently only used by test code |
| #[allow(dead_code)] |
| fn dst_types(&self) -> DstTypeList { |
| self.attrs() |
| } |
| |
| fn dst_idx(&self, dst: &Dst) -> usize { |
| let r = self.dsts_as_slice().as_ptr_range(); |
| assert!(r.contains(&(dst as *const Dst))); |
| unsafe { (dst as *const Dst).offset_from(r.start) as usize } |
| } |
| } |
| |
| impl<T: AsSlice<Dst, Attr = DstType>> DstsAsSlice for T {} |
| |
| pub trait IsUniform { |
| fn is_uniform(&self) -> bool; |
| } |
| |
| impl<T: DstsAsSlice> IsUniform for T { |
| fn is_uniform(&self) -> bool { |
| all_dsts_uniform(self.dsts_as_slice()) |
| } |
| } |
| |
| fn fmt_dst_slice(f: &mut fmt::Formatter<'_>, dsts: &[Dst]) -> fmt::Result { |
| if dsts.is_empty() { |
| return Ok(()); |
| } |
| |
| // Figure out the last non-null dst |
| // |
| // Note: By making the top inclusive and starting at 0, we ensure that |
| // at least one dst always gets printed. |
| let mut last_dst = 0; |
| for (i, dst) in dsts.iter().enumerate() { |
| if !dst.is_none() { |
| last_dst = i; |
| } |
| } |
| |
| for i in 0..=last_dst { |
| if i != 0 { |
| write!(f, " ")?; |
| } |
| write!(f, "{}", &dsts[i])?; |
| } |
| Ok(()) |
| } |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy)] |
| pub enum FoldData { |
| Pred(bool), |
| Carry(bool), |
| U32(u32), |
| Vec2([u32; 2]), |
| } |
| |
| pub struct OpFoldData<'a> { |
| pub dsts: &'a mut [FoldData], |
| pub srcs: &'a [FoldData], |
| } |
| |
| impl OpFoldData<'_> { |
| #[allow(dead_code)] |
| pub fn get_pred_src(&self, op: &impl SrcsAsSlice, src: &Src) -> bool { |
| let i = op.src_idx(src); |
| let b = match src.src_ref { |
| SrcRef::Zero | SrcRef::Imm32(_) => panic!("Expected a predicate"), |
| SrcRef::True => true, |
| SrcRef::False => false, |
| _ => { |
| if let FoldData::Pred(b) = self.srcs[i] { |
| b |
| } else { |
| panic!("FoldData is not a predicate"); |
| } |
| } |
| }; |
| b ^ src.src_mod.is_bnot() |
| } |
| |
| pub fn get_u32_src(&self, op: &impl SrcsAsSlice, src: &Src) -> u32 { |
| let i = op.src_idx(src); |
| match src.src_ref { |
| SrcRef::Zero => 0, |
| SrcRef::Imm32(imm) => imm, |
| SrcRef::True | SrcRef::False => panic!("Unexpected predicate"), |
| _ => { |
| if let FoldData::U32(u) = self.srcs[i] { |
| u |
| } else { |
| panic!("FoldData is not a U32"); |
| } |
| } |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn get_u32_bnot_src(&self, op: &impl SrcsAsSlice, src: &Src) -> u32 { |
| let x = self.get_u32_src(op, src); |
| if src.src_mod.is_bnot() { |
| !x |
| } else { |
| x |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn get_carry_src(&self, op: &impl SrcsAsSlice, src: &Src) -> bool { |
| assert!(src.src_ref.as_ssa().is_some()); |
| let i = op.src_idx(src); |
| if let FoldData::Carry(b) = self.srcs[i] { |
| b |
| } else { |
| panic!("FoldData is not a predicate"); |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn get_f32_src(&self, op: &impl SrcsAsSlice, src: &Src) -> f32 { |
| f32::from_bits(self.get_u32_src(op, src)) |
| } |
| |
| #[allow(dead_code)] |
| pub fn get_f64_src(&self, op: &impl SrcsAsSlice, src: &Src) -> f64 { |
| let i = op.src_idx(src); |
| match src.src_ref { |
| SrcRef::Zero => 0.0, |
| SrcRef::Imm32(imm) => f64::from_bits(u64::from(imm) << 32), |
| SrcRef::True | SrcRef::False => panic!("Unexpected predicate"), |
| _ => { |
| if let FoldData::Vec2(v) = self.srcs[i] { |
| let u = u64::from(v[0]) | (u64::from(v[1]) << 32); |
| f64::from_bits(u) |
| } else { |
| panic!("FoldData is not a U32"); |
| } |
| } |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn set_pred_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, b: bool) { |
| self.dsts[op.dst_idx(dst)] = FoldData::Pred(b); |
| } |
| |
| #[allow(dead_code)] |
| pub fn set_carry_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, b: bool) { |
| self.dsts[op.dst_idx(dst)] = FoldData::Carry(b); |
| } |
| |
| pub fn set_u32_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, u: u32) { |
| self.dsts[op.dst_idx(dst)] = FoldData::U32(u); |
| } |
| |
| #[allow(dead_code)] |
| pub fn set_f32_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, f: f32) { |
| self.set_u32_dst(op, dst, f.to_bits()); |
| } |
| |
| #[allow(dead_code)] |
| pub fn set_f64_dst(&mut self, op: &impl DstsAsSlice, dst: &Dst, f: f64) { |
| let u = f.to_bits(); |
| let v = [u as u32, (u >> 32) as u32]; |
| self.dsts[op.dst_idx(dst)] = FoldData::Vec2(v); |
| } |
| } |
| |
| pub trait Foldable: SrcsAsSlice + DstsAsSlice { |
| // Currently only used by test code |
| #[allow(dead_code)] |
| fn fold(&self, sm: &dyn ShaderModel, f: &mut OpFoldData<'_>); |
| } |
| |
| pub trait DisplayOp: DstsAsSlice { |
| fn fmt_dsts(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| fmt_dst_slice(f, self.dsts_as_slice()) |
| } |
| |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; |
| } |
| |
| // Hack struct so we can re-use Formatters. Shamelessly stolen from |
| // https://users.rust-lang.org/t/reusing-an-fmt-formatter/8531/4 |
| pub struct Fmt<F>(pub F) |
| where |
| F: Fn(&mut fmt::Formatter) -> fmt::Result; |
| |
| impl<F> fmt::Display for Fmt<F> |
| where |
| F: Fn(&mut fmt::Formatter) -> fmt::Result, |
| { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| (self.0)(f) |
| } |
| } |
| |
| macro_rules! impl_display_for_op { |
| ($op: ident) => { |
| impl fmt::Display for $op { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let mut s = String::new(); |
| write!(s, "{}", Fmt(|f| self.fmt_dsts(f)))?; |
| if !s.is_empty() { |
| write!(f, "{} = ", s)?; |
| } |
| self.fmt_op(f) |
| } |
| } |
| }; |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum PredSetOp { |
| And, |
| Or, |
| Xor, |
| } |
| |
| impl PredSetOp { |
| #[allow(dead_code)] |
| pub fn eval(&self, a: bool, b: bool) -> bool { |
| match self { |
| PredSetOp::And => a & b, |
| PredSetOp::Or => a | b, |
| PredSetOp::Xor => a ^ b, |
| } |
| } |
| |
| pub fn is_trivial(&self, accum: &Src) -> bool { |
| if let Some(b) = accum.as_bool() { |
| match self { |
| PredSetOp::And => b, |
| PredSetOp::Or => !b, |
| PredSetOp::Xor => !b, |
| } |
| } else { |
| false |
| } |
| } |
| } |
| |
| impl fmt::Display for PredSetOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| PredSetOp::And => write!(f, ".and"), |
| PredSetOp::Or => write!(f, ".or"), |
| PredSetOp::Xor => write!(f, ".xor"), |
| } |
| } |
| } |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum FloatCmpOp { |
| OrdEq, |
| OrdNe, |
| OrdLt, |
| OrdLe, |
| OrdGt, |
| OrdGe, |
| UnordEq, |
| UnordNe, |
| UnordLt, |
| UnordLe, |
| UnordGt, |
| UnordGe, |
| IsNum, |
| IsNan, |
| } |
| |
| impl FloatCmpOp { |
| pub fn flip(self) -> FloatCmpOp { |
| match self { |
| FloatCmpOp::OrdEq | FloatCmpOp::OrdNe => self, |
| FloatCmpOp::OrdLt => FloatCmpOp::OrdGt, |
| FloatCmpOp::OrdLe => FloatCmpOp::OrdGe, |
| FloatCmpOp::OrdGt => FloatCmpOp::OrdLt, |
| FloatCmpOp::OrdGe => FloatCmpOp::OrdLe, |
| FloatCmpOp::UnordEq | FloatCmpOp::UnordNe => self, |
| FloatCmpOp::UnordLt => FloatCmpOp::UnordGt, |
| FloatCmpOp::UnordLe => FloatCmpOp::UnordGe, |
| FloatCmpOp::UnordGt => FloatCmpOp::UnordLt, |
| FloatCmpOp::UnordGe => FloatCmpOp::UnordLe, |
| FloatCmpOp::IsNum | FloatCmpOp::IsNan => panic!("Cannot flip unop"), |
| } |
| } |
| } |
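| |
| // A sketch of what flip() means: it mirrors the comparison so swapping the |
| // two operands preserves the result, e.g. a < b iff b > a. |
| #[cfg(test)] |
| mod float_cmp_flip_sketch { |
| use super::*; |
| |
| #[test] |
| fn flip_mirrors_operands() { |
| assert!(FloatCmpOp::OrdLt.flip() == FloatCmpOp::OrdGt); |
| assert!(FloatCmpOp::UnordLe.flip() == FloatCmpOp::UnordGe); |
| // Symmetric comparisons are their own flip. |
| assert!(FloatCmpOp::OrdEq.flip() == FloatCmpOp::OrdEq); |
| } |
| } |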
| |
| impl fmt::Display for FloatCmpOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| FloatCmpOp::OrdEq => write!(f, ".eq"), |
| FloatCmpOp::OrdNe => write!(f, ".ne"), |
| FloatCmpOp::OrdLt => write!(f, ".lt"), |
| FloatCmpOp::OrdLe => write!(f, ".le"), |
| FloatCmpOp::OrdGt => write!(f, ".gt"), |
| FloatCmpOp::OrdGe => write!(f, ".ge"), |
| FloatCmpOp::UnordEq => write!(f, ".equ"), |
| FloatCmpOp::UnordNe => write!(f, ".neu"), |
| FloatCmpOp::UnordLt => write!(f, ".ltu"), |
| FloatCmpOp::UnordLe => write!(f, ".leu"), |
| FloatCmpOp::UnordGt => write!(f, ".gtu"), |
| FloatCmpOp::UnordGe => write!(f, ".geu"), |
| FloatCmpOp::IsNum => write!(f, ".num"), |
| FloatCmpOp::IsNan => write!(f, ".nan"), |
| } |
| } |
| } |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum IntCmpOp { |
| False, |
| True, |
| Eq, |
| Ne, |
| Lt, |
| Le, |
| Gt, |
| Ge, |
| } |
| |
| impl IntCmpOp { |
| pub fn flip(self) -> IntCmpOp { |
| match self { |
| IntCmpOp::False | IntCmpOp::True => self, |
| IntCmpOp::Eq | IntCmpOp::Ne => self, |
| IntCmpOp::Lt => IntCmpOp::Gt, |
| IntCmpOp::Le => IntCmpOp::Ge, |
| IntCmpOp::Gt => IntCmpOp::Lt, |
| IntCmpOp::Ge => IntCmpOp::Le, |
| } |
| } |
| } |
| |
| impl fmt::Display for IntCmpOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| IntCmpOp::False => write!(f, ".f"), |
| IntCmpOp::True => write!(f, ".t"), |
| IntCmpOp::Eq => write!(f, ".eq"), |
| IntCmpOp::Ne => write!(f, ".ne"), |
| IntCmpOp::Lt => write!(f, ".lt"), |
| IntCmpOp::Le => write!(f, ".le"), |
| IntCmpOp::Gt => write!(f, ".gt"), |
| IntCmpOp::Ge => write!(f, ".ge"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum IntCmpType { |
| U32, |
| I32, |
| } |
| |
| impl IntCmpType { |
| #[allow(dead_code)] |
| pub fn is_signed(&self) -> bool { |
| match self { |
| IntCmpType::U32 => false, |
| IntCmpType::I32 => true, |
| } |
| } |
| } |
| |
| impl fmt::Display for IntCmpType { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| IntCmpType::U32 => write!(f, ".u32"), |
| IntCmpType::I32 => write!(f, ".i32"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum LogicOp2 { |
| And, |
| Or, |
| Xor, |
| PassB, |
| } |
| |
| impl fmt::Display for LogicOp2 { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| LogicOp2::And => write!(f, "and"), |
| LogicOp2::Or => write!(f, "or"), |
| LogicOp2::Xor => write!(f, "xor"), |
| LogicOp2::PassB => write!(f, "pass_b"), |
| } |
| } |
| } |
| |
| impl LogicOp2 { |
| pub fn to_lut(self) -> LogicOp3 { |
| match self { |
| LogicOp2::And => LogicOp3::new_lut(&|x, y, _| x & y), |
| LogicOp2::Or => LogicOp3::new_lut(&|x, y, _| x | y), |
| LogicOp2::Xor => LogicOp3::new_lut(&|x, y, _| x ^ y), |
| LogicOp2::PassB => LogicOp3::new_lut(&|_, b, _| b), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub struct LogicOp3 { |
| pub lut: u8, |
| } |
| |
| impl LogicOp3 { |
| pub const SRC_MASKS: [u8; 3] = [0xf0, 0xcc, 0xaa]; |
| |
| #[inline] |
| pub fn new_lut<F: Fn(u8, u8, u8) -> u8>(f: &F) -> LogicOp3 { |
| LogicOp3 { |
| lut: f( |
| LogicOp3::SRC_MASKS[0], |
| LogicOp3::SRC_MASKS[1], |
| LogicOp3::SRC_MASKS[2], |
| ), |
| } |
| } |
| |
| pub fn new_const(val: bool) -> LogicOp3 { |
| LogicOp3 { |
| lut: if val { !0 } else { 0 }, |
| } |
| } |
| |
| pub fn src_used(&self, src_idx: usize) -> bool { |
| let mask = LogicOp3::SRC_MASKS[src_idx]; |
| let shift = mask.trailing_zeros(); |
| self.lut & !mask != (self.lut >> shift) & !mask |
| } |
| |
| pub fn fix_src(&mut self, src_idx: usize, val: bool) { |
| let mask = LogicOp3::SRC_MASKS[src_idx]; |
| let shift = mask.trailing_zeros(); |
| if val { |
| let t_bits = self.lut & mask; |
| self.lut = t_bits | (t_bits >> shift) |
| } else { |
| let f_bits = self.lut & !mask; |
| self.lut = (f_bits << shift) | f_bits |
| }; |
| } |
| |
| pub fn invert_src(&mut self, src_idx: usize) { |
| let mask = LogicOp3::SRC_MASKS[src_idx]; |
| let shift = mask.trailing_zeros(); |
| let t_bits = self.lut & mask; |
| let f_bits = self.lut & !mask; |
| self.lut = (f_bits << shift) | (t_bits >> shift); |
| } |
| |
| pub fn eval< |
| T: BitAnd<Output = T> + BitOr<Output = T> + Copy + Not<Output = T>, |
| >( |
| &self, |
| x: T, |
| y: T, |
| z: T, |
| ) -> T { |
| let mut res = x & !x; // zero |
| if (self.lut & (1 << 0)) != 0 { |
| res = res | (!x & !y & !z); |
| } |
| if (self.lut & (1 << 1)) != 0 { |
| res = res | (!x & !y & z); |
| } |
| if (self.lut & (1 << 2)) != 0 { |
| res = res | (!x & y & !z); |
| } |
| if (self.lut & (1 << 3)) != 0 { |
| res = res | (!x & y & z); |
| } |
| if (self.lut & (1 << 4)) != 0 { |
| res = res | (x & !y & !z); |
| } |
| if (self.lut & (1 << 5)) != 0 { |
| res = res | (x & !y & z); |
| } |
| if (self.lut & (1 << 6)) != 0 { |
| res = res | (x & y & !z); |
| } |
| if (self.lut & (1 << 7)) != 0 { |
| res = res | (x & y & z); |
| } |
| res |
| } |
| } |
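| |
| // A sketch of the LOP3-style lookup table: new_lut() evaluates the closure |
| // on the three source masks, which yields the 8-entry truth table directly. |
| #[cfg(test)] |
| mod logic_op3_sketch { |
| use super::*; |
| |
| #[test] |
| fn lut_encoding_and_eval() { |
| let and2 = LogicOp3::new_lut(&|x, y, _| x & y); |
| // The rows where x = y = 1 are bits 6 and 7 of the LUT. |
| assert_eq!(and2.lut, 0xc0); |
| // The third source is unused by a two-input AND. |
| assert!(and2.src_used(0) && and2.src_used(1) && !and2.src_used(2)); |
| // eval() applies the same truth table to arbitrary bit vectors. |
| assert_eq!(and2.eval(0b1100_u8, 0b1010, 0), 0b1000); |
| } |
| } |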
| |
| impl fmt::Display for LogicOp3 { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "LUT[{:#x}]", self.lut) |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum FloatType { |
| F16, |
| F32, |
| F64, |
| } |
| |
| impl FloatType { |
| pub fn from_bits(bits: usize) -> FloatType { |
| match bits { |
| 16 => FloatType::F16, |
| 32 => FloatType::F32, |
| 64 => FloatType::F64, |
| _ => panic!("Invalid float type size"), |
| } |
| } |
| |
| pub fn bits(&self) -> usize { |
| match self { |
| FloatType::F16 => 16, |
| FloatType::F32 => 32, |
| FloatType::F64 => 64, |
| } |
| } |
| } |
| |
| impl fmt::Display for FloatType { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| FloatType::F16 => write!(f, ".f16"), |
| FloatType::F32 => write!(f, ".f32"), |
| FloatType::F64 => write!(f, ".f64"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum FRndMode { |
| NearestEven, |
| NegInf, |
| PosInf, |
| Zero, |
| } |
| |
| impl fmt::Display for FRndMode { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| FRndMode::NearestEven => write!(f, ".re"), |
| FRndMode::NegInf => write!(f, ".rm"), |
| FRndMode::PosInf => write!(f, ".rp"), |
| FRndMode::Zero => write!(f, ".rz"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub struct TexCBufRef { |
| pub idx: u8, |
| pub offset: u16, |
| } |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum TexRef { |
| Bound(u16), |
| CBuf(TexCBufRef), |
| Bindless, |
| } |
| |
| impl fmt::Display for TexRef { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| TexRef::Bound(idx) => write!(f, "tex[{idx}]"), |
| TexRef::CBuf(TexCBufRef { idx, offset }) => { |
| write!(f, "c[{idx:#x}][{offset:#x}]") |
| } |
| TexRef::Bindless => write!(f, "bindless"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum TexDim { |
| _1D, |
| Array1D, |
| _2D, |
| Array2D, |
| _3D, |
| Cube, |
| ArrayCube, |
| } |
| |
| impl fmt::Display for TexDim { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| TexDim::_1D => write!(f, ".1d"), |
| TexDim::Array1D => write!(f, ".a1d"), |
| TexDim::_2D => write!(f, ".2d"), |
| TexDim::Array2D => write!(f, ".a2d"), |
| TexDim::_3D => write!(f, ".3d"), |
| TexDim::Cube => write!(f, ".cube"), |
| TexDim::ArrayCube => write!(f, ".acube"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum TexLodMode { |
| Auto, |
| Zero, |
| Bias, |
| Lod, |
| Clamp, |
| BiasClamp, |
| } |
| |
| impl TexLodMode { |
| pub fn is_explicit_lod(&self) -> bool { |
| match self { |
| TexLodMode::Auto |
| | TexLodMode::Bias |
| | TexLodMode::Clamp |
| | TexLodMode::BiasClamp => false, |
| TexLodMode::Zero | TexLodMode::Lod => true, |
| } |
| } |
| } |
| |
| impl fmt::Display for TexLodMode { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| TexLodMode::Auto => write!(f, ""), |
| TexLodMode::Zero => write!(f, ".lz"), |
| TexLodMode::Bias => write!(f, ".lb"), |
| TexLodMode::Lod => write!(f, ".ll"), |
| TexLodMode::Clamp => write!(f, ".lc"), |
| TexLodMode::BiasClamp => write!(f, ".lb.lc"), |
| } |
| } |
| } |
| |
| /// Derivative behavior for tex ops and FSwzAdd |
| /// |
| /// The descriptions here may not be wholly accurate as they come from cobbling |
| /// together a bunch of pieces. This is my (Faith's) best understanding of how |
| /// these things work. |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum TexDerivMode { |
| /// Automatic |
| /// |
| /// For partial (not full) quads, the derivative will default to the value |
| /// of DEFAULT_PARTIAL in SET_SHADER_CONTROL. |
| /// |
| /// On Volta and earlier GPUs or on Blackwell B and later, derivatives in |
| /// all non-fragment shader stages are assumed to be partial. |
| Auto, |
| |
| /// Assume a non-divergent (full) derivative |
| /// |
| /// Partial derivative checks are skipped and the hardware does the |
| /// derivative anyway, possibly on rubbish data. |
| NonDivergent, |
| |
| /// Force the derivative to be considered divergent (partial) |
| /// |
| /// This only exists as a separate thing on Blackwell A. On Hopper and |
| /// earlier, there is a .fdv that's part of the LodMode, but only for |
| /// LodMode::Clamp. On Blackwell B, it appears (according to the |
| /// disassembler) to be removed again in favor of DerivXY. |
| ForceDivergent, |
| |
| /// Attempt an X/Y derivative, ignoring shader stage |
| /// |
| /// This is (I think) identical to Auto except that it ignores the shader |
| /// stage checks. This is new on Blackwell B+. |
| DerivXY, |
| } |
| |
| impl fmt::Display for TexDerivMode { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| TexDerivMode::Auto => Ok(()), |
| TexDerivMode::NonDivergent => write!(f, ".ndv"), |
| TexDerivMode::ForceDivergent => write!(f, ".fdv"), |
| TexDerivMode::DerivXY => write!(f, ".dxy"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub struct ChannelMask(u8); |
| |
| impl ChannelMask { |
| pub fn new(mask: u8) -> Self { |
| assert!(mask != 0 && (mask & !0xf) == 0); |
| ChannelMask(mask) |
| } |
| |
| pub fn for_comps(comps: u8) -> Self { |
| assert!(comps > 0 && comps <= 4); |
| ChannelMask((1 << comps) - 1) |
| } |
| |
| pub fn to_bits(self) -> u8 { |
| self.0 |
| } |
| } |
| |
| impl fmt::Display for ChannelMask { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, ".")?; |
| for (i, c) in ['r', 'g', 'b', 'a'].into_iter().enumerate() { |
| if self.0 & (1 << i) != 0 { |
| write!(f, "{c}")?; |
| } |
| } |
| Ok(()) |
| } |
| } |
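| |
| // A small sketch of ChannelMask: a 4-bit rgba mask where for_comps() |
| // selects the first N channels. |
| #[cfg(test)] |
| mod channel_mask_sketch { |
| use super::*; |
| |
| #[test] |
| fn comps_to_mask() { |
| let mask = ChannelMask::for_comps(3); |
| assert_eq!(mask.to_bits(), 0b0111); |
| assert_eq!(format!("{}", mask), ".rgb"); |
| assert_eq!(format!("{}", ChannelMask::new(0b1010)), ".ga"); |
| } |
| } |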
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum TexOffsetMode { |
| None, |
| AddOffI, |
| PerPx, // tld4 only |
| } |
| |
| impl fmt::Display for TexOffsetMode { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| TexOffsetMode::None => write!(f, ""), |
| TexOffsetMode::AddOffI => write!(f, ".aoffi"), |
| TexOffsetMode::PerPx => write!(f, ".ptp"), |
| } |
| } |
| } |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum TexQuery { |
| Dimension, |
| TextureType, |
| SamplerPos, |
| } |
| |
| impl fmt::Display for TexQuery { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| TexQuery::Dimension => write!(f, "dimension"), |
| TexQuery::TextureType => write!(f, "texture_type"), |
| TexQuery::SamplerPos => write!(f, "sampler_pos"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum ImageDim { |
| _1D, |
| _1DBuffer, |
| _1DArray, |
| _2D, |
| _2DArray, |
| _3D, |
| } |
| |
| impl ImageDim { |
| pub fn coord_comps(&self) -> u8 { |
| match self { |
| ImageDim::_1D => 1, |
| ImageDim::_1DBuffer => 1, |
| ImageDim::_1DArray => 2, |
| ImageDim::_2D => 2, |
| ImageDim::_2DArray => 3, |
| ImageDim::_3D => 3, |
| } |
| } |
| } |
| |
| impl fmt::Display for ImageDim { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| ImageDim::_1D => write!(f, ".1d"), |
| ImageDim::_1DBuffer => write!(f, ".buf"), |
| ImageDim::_1DArray => write!(f, ".a1d"), |
| ImageDim::_2D => write!(f, ".2d"), |
| ImageDim::_2DArray => write!(f, ".a2d"), |
| ImageDim::_3D => write!(f, ".3d"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |
| pub enum IntType { |
| U8, |
| I8, |
| U16, |
| I16, |
| U32, |
| I32, |
| U64, |
| I64, |
| } |
| |
| impl IntType { |
| pub fn from_bits(bits: usize, is_signed: bool) -> IntType { |
| match bits { |
| 8 => { |
| if is_signed { |
| IntType::I8 |
| } else { |
| IntType::U8 |
| } |
| } |
| 16 => { |
| if is_signed { |
| IntType::I16 |
| } else { |
| IntType::U16 |
| } |
| } |
| 32 => { |
| if is_signed { |
| IntType::I32 |
| } else { |
| IntType::U32 |
| } |
| } |
| 64 => { |
| if is_signed { |
| IntType::I64 |
| } else { |
| IntType::U64 |
| } |
| } |
| _ => panic!("Invalid integer type size"), |
| } |
| } |
| |
| pub fn is_signed(&self) -> bool { |
| match self { |
| IntType::U8 | IntType::U16 | IntType::U32 | IntType::U64 => false, |
| IntType::I8 | IntType::I16 | IntType::I32 | IntType::I64 => true, |
| } |
| } |
| |
| pub fn bits(&self) -> usize { |
| match self { |
| IntType::U8 | IntType::I8 => 8, |
| IntType::U16 | IntType::I16 => 16, |
| IntType::U32 | IntType::I32 => 32, |
| IntType::U64 | IntType::I64 => 64, |
| } |
| } |
| } |
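| |
| // A sketch of the IntType round trip between from_bits() and the |
| // bits()/is_signed() accessors. |
| #[cfg(test)] |
| mod int_type_sketch { |
| use super::*; |
| |
| #[test] |
| fn bits_round_trip() { |
| for bits in [8_usize, 16, 32, 64] { |
| for signed in [false, true] { |
| let t = IntType::from_bits(bits, signed); |
| assert_eq!(t.bits(), bits); |
| assert_eq!(t.is_signed(), signed); |
| } |
| } |
| } |
| } |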
| |
| impl fmt::Display for IntType { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| IntType::U8 => write!(f, ".u8"), |
| IntType::I8 => write!(f, ".i8"), |
| IntType::U16 => write!(f, ".u16"), |
| IntType::I16 => write!(f, ".i16"), |
| IntType::U32 => write!(f, ".u32"), |
| IntType::I32 => write!(f, ".i32"), |
| IntType::U64 => write!(f, ".u64"), |
| IntType::I64 => write!(f, ".i64"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum MemAddrType { |
| A32, |
| A64, |
| } |
| |
| impl fmt::Display for MemAddrType { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| MemAddrType::A32 => write!(f, ".a32"), |
| MemAddrType::A64 => write!(f, ".a64"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum MemType { |
| U8, |
| I8, |
| U16, |
| I16, |
| B32, |
| B64, |
| B128, |
| } |
| |
| impl MemType { |
| pub fn from_size(size: u8, is_signed: bool) -> MemType { |
| match size { |
| 1 => { |
| if is_signed { |
| MemType::I8 |
| } else { |
| MemType::U8 |
| } |
| } |
| 2 => { |
| if is_signed { |
| MemType::I16 |
| } else { |
| MemType::U16 |
| } |
| } |
| 4 => MemType::B32, |
| 8 => MemType::B64, |
| 16 => MemType::B128, |
| _ => panic!("Invalid memory load/store size"), |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn bits(&self) -> usize { |
| match self { |
| MemType::U8 | MemType::I8 => 8, |
| MemType::U16 | MemType::I16 => 16, |
| MemType::B32 => 32, |
| MemType::B64 => 64, |
| MemType::B128 => 128, |
| } |
| } |
| } |
| |
| impl fmt::Display for MemType { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| MemType::U8 => write!(f, ".u8"), |
| MemType::I8 => write!(f, ".i8"), |
| MemType::U16 => write!(f, ".u16"), |
| MemType::I16 => write!(f, ".i16"), |
| MemType::B32 => write!(f, ".b32"), |
| MemType::B64 => write!(f, ".b64"), |
| MemType::B128 => write!(f, ".b128"), |
| } |
| } |
| } |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum MemOrder { |
| Constant, |
| Weak, |
| Strong(MemScope), |
| } |
| |
| impl fmt::Display for MemOrder { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| MemOrder::Constant => write!(f, ".constant"), |
| MemOrder::Weak => write!(f, ".weak"), |
| MemOrder::Strong(scope) => write!(f, ".strong{}", scope), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum MemScope { |
| CTA, |
| GPU, |
| System, |
| } |
| |
| impl fmt::Display for MemScope { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| MemScope::CTA => write!(f, ".cta"), |
| MemScope::GPU => write!(f, ".gpu"), |
| MemScope::System => write!(f, ".sys"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum MemSpace { |
| Global(MemAddrType), |
| Local, |
| Shared, |
| } |
| |
| impl MemSpace { |
| pub fn addr_type(&self) -> MemAddrType { |
| match self { |
| MemSpace::Global(t) => *t, |
| MemSpace::Local => MemAddrType::A32, |
| MemSpace::Shared => MemAddrType::A32, |
| } |
| } |
| } |
| |
| impl fmt::Display for MemSpace { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| MemSpace::Global(t) => write!(f, ".global{t}"), |
| MemSpace::Local => write!(f, ".local"), |
| MemSpace::Shared => write!(f, ".shared"), |
| } |
| } |
| } |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum MemEvictionPriority { |
| First, |
| Normal, |
| Last, |
| LastUse, |
| Unchanged, |
| NoAllocate, |
| } |
| |
| impl fmt::Display for MemEvictionPriority { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| MemEvictionPriority::First => write!(f, ".ef"), |
| MemEvictionPriority::Normal => Ok(()), |
| MemEvictionPriority::Last => write!(f, ".el"), |
| MemEvictionPriority::LastUse => write!(f, ".lu"), |
| MemEvictionPriority::Unchanged => write!(f, ".eu"), |
| MemEvictionPriority::NoAllocate => write!(f, ".na"), |
| } |
| } |
| } |
| |
| /// Memory load cache ops used by Kepler |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Default, Eq, Hash, PartialEq)] |
| pub enum LdCacheOp { |
| #[default] |
| CacheAll, |
| CacheGlobal, |
    /// This cache mode is not officially documented by NVIDIA. What we do know
    /// is that the CUDA C programming guide says:
| /// |
| /// > The read-only data cache load function is only supported by devices |
| /// > of compute capability 5.0 and higher. |
| /// > ```c |
| /// > T __ldg(const T* address); |
| /// > ``` |
| /// |
| /// and we know that `__ldg()` compiles to `ld.global.nc` in PTX which |
| /// compiles to `ld.ci`. The PTX 5.0 docs say: |
| /// |
| /// > Load register variable `d` from the location specified by the source |
| /// > address operand `a` in the global state space, and optionally cache in |
| /// > non-coherent texture cache. Since the cache is non-coherent, the data |
| /// > should be read-only within the kernel's process. |
| /// |
    /// Since `.nc` means "non-coherent", the name "incoherent" seems about
    /// right. The quote above also seems to imply that these loads go through
    /// the texture cache, but we don't fully understand the implications of
    /// that.
| CacheIncoherent, |
| CacheStreaming, |
| CacheInvalidate, |
| } |
| |
| impl fmt::Display for LdCacheOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| LdCacheOp::CacheAll => write!(f, ".ca"), |
| LdCacheOp::CacheGlobal => write!(f, ".cg"), |
| LdCacheOp::CacheIncoherent => write!(f, ".ci"), |
| LdCacheOp::CacheStreaming => write!(f, ".cs"), |
| LdCacheOp::CacheInvalidate => write!(f, ".cv"), |
| } |
| } |
| } |
| |
| impl LdCacheOp { |
| pub fn select( |
| sm: &dyn ShaderModel, |
| space: MemSpace, |
| order: MemOrder, |
| _eviction_priority: MemEvictionPriority, |
| ) -> Self { |
| match space { |
| MemSpace::Global(_) => match order { |
| MemOrder::Constant => { |
| if sm.sm() >= 50 { |
| // This is undocumented in the CUDA docs but NVIDIA uses |
| // it for constant loads. |
| LdCacheOp::CacheIncoherent |
| } else { |
| LdCacheOp::CacheAll |
| } |
| } |
| MemOrder::Strong(MemScope::System) => { |
| LdCacheOp::CacheInvalidate |
| } |
| _ => { |
| // From the CUDA 10.2 docs: |
| // |
| // "The default load instruction cache operation is |
| // ld.ca, which allocates cache lines in all levels (L1 |
| // and L2) with normal eviction policy. Global data is |
| // coherent at the L2 level, but multiple L1 caches are |
| // not coherent for global data. If one thread stores to |
| // global memory via one L1 cache, and a second thread |
| // loads that address via a second L1 cache with ld.ca, |
| // the second thread may get stale L1 cache data" |
| // |
| // and |
| // |
| // "L1 caching in Kepler GPUs is reserved only for local |
| // memory accesses, such as register spills and stack |
| // data. Global loads are cached in L2 only (or in the |
| // Read-Only Data Cache)." |
| // |
| // We follow suit and use CacheGlobal for all global memory |
| // access on Kepler. On Maxwell, it appears safe to use |
| // CacheAll for everything. |
| if sm.sm() >= 50 { |
| LdCacheOp::CacheAll |
| } else { |
| LdCacheOp::CacheGlobal |
| } |
| } |
| }, |
| MemSpace::Local | MemSpace::Shared => LdCacheOp::CacheAll, |
| } |
| } |
| } |
| |
| /// Memory store cache ops used by Kepler |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Default, Eq, Hash, PartialEq)] |
| pub enum StCacheOp { |
| #[default] |
| WriteBack, |
| CacheGlobal, |
| CacheStreaming, |
| WriteThrough, |
| } |
| |
| impl fmt::Display for StCacheOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| StCacheOp::WriteBack => write!(f, ".wb"), |
| StCacheOp::CacheGlobal => write!(f, ".cg"), |
| StCacheOp::CacheStreaming => write!(f, ".cs"), |
| StCacheOp::WriteThrough => write!(f, ".wt"), |
| } |
| } |
| } |
| |
| impl StCacheOp { |
| pub fn select( |
| sm: &dyn ShaderModel, |
| space: MemSpace, |
| order: MemOrder, |
| _eviction_priority: MemEvictionPriority, |
| ) -> Self { |
| match space { |
| MemSpace::Global(_) => match order { |
| MemOrder::Constant => panic!("Cannot store to constant"), |
| MemOrder::Strong(MemScope::System) => StCacheOp::WriteThrough, |
| _ => { |
| // See the corresponding comment in LdCacheOp::select() |
| if sm.sm() >= 50 { |
| StCacheOp::WriteBack |
| } else { |
| StCacheOp::CacheGlobal |
| } |
| } |
| }, |
| MemSpace::Local | MemSpace::Shared => StCacheOp::WriteBack, |
| } |
| } |
| } |
| |
| #[derive(Clone)] |
| pub struct MemAccess { |
| pub mem_type: MemType, |
| pub space: MemSpace, |
| pub order: MemOrder, |
| pub eviction_priority: MemEvictionPriority, |
| } |
| |
| impl fmt::Display for MemAccess { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "{}{}{}{}", |
| self.space, self.order, self.eviction_priority, self.mem_type, |
| ) |
| } |
| } |
| |
| impl MemAccess { |
| pub fn ld_cache_op(&self, sm: &dyn ShaderModel) -> LdCacheOp { |
| LdCacheOp::select(sm, self.space, self.order, self.eviction_priority) |
| } |
| |
| pub fn st_cache_op(&self, sm: &dyn ShaderModel) -> StCacheOp { |
| StCacheOp::select(sm, self.space, self.order, self.eviction_priority) |
| } |
| } |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum AtomType { |
| F16x2, |
| U32, |
| I32, |
| F32, |
| U64, |
| I64, |
| F64, |
| } |
| |
| impl AtomType { |
| pub fn F(bits: u8) -> AtomType { |
| match bits { |
| 16 => panic!("16-bit float atomics not yet supported"), |
| 32 => AtomType::F32, |
| 64 => AtomType::F64, |
| _ => panic!("Invalid float atomic type"), |
| } |
| } |
| |
| pub fn U(bits: u8) -> AtomType { |
| match bits { |
| 32 => AtomType::U32, |
| 64 => AtomType::U64, |
| _ => panic!("Invalid uint atomic type"), |
| } |
| } |
| |
| pub fn I(bits: u8) -> AtomType { |
| match bits { |
| 32 => AtomType::I32, |
| 64 => AtomType::I64, |
| _ => panic!("Invalid int atomic type"), |
| } |
| } |
| |
| pub fn bits(&self) -> usize { |
| match self { |
| AtomType::F16x2 | AtomType::F32 => 32, |
| AtomType::U32 | AtomType::I32 => 32, |
| AtomType::U64 | AtomType::I64 | AtomType::F64 => 64, |
| } |
| } |
| |
| pub fn is_float(&self) -> bool { |
| match self { |
| AtomType::F16x2 | AtomType::F32 | AtomType::F64 => true, |
| AtomType::U32 | AtomType::I32 | AtomType::U64 | AtomType::I64 => { |
| false |
| } |
| } |
| } |
| } |
| |
| impl fmt::Display for AtomType { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| AtomType::F16x2 => write!(f, ".f16x2"), |
| AtomType::U32 => write!(f, ".u32"), |
| AtomType::I32 => write!(f, ".i32"), |
| AtomType::F32 => write!(f, ".f32"), |
| AtomType::U64 => write!(f, ".u64"), |
| AtomType::I64 => write!(f, ".i64"), |
| AtomType::F64 => write!(f, ".f64"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum AtomCmpSrc { |
| /// The cmpr value is passed as a separate source |
| Separate, |
    /// The cmpr value is packed in with the data, with cmpr coming first
| Packed, |
| } |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum AtomOp { |
| Add, |
| Min, |
| Max, |
| Inc, |
| Dec, |
| And, |
| Or, |
| Xor, |
| Exch, |
| CmpExch(AtomCmpSrc), |
| } |
| |
| impl AtomOp { |
| pub fn is_reduction(&self) -> bool { |
| match self { |
| AtomOp::Add |
| | AtomOp::Min |
| | AtomOp::Max |
| | AtomOp::Inc |
| | AtomOp::Dec |
| | AtomOp::And |
| | AtomOp::Or |
| | AtomOp::Xor => true, |
| AtomOp::Exch | AtomOp::CmpExch(_) => false, |
| } |
| } |
| } |
| |
| impl fmt::Display for AtomOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| AtomOp::Add => write!(f, ".add"), |
| AtomOp::Min => write!(f, ".min"), |
| AtomOp::Max => write!(f, ".max"), |
| AtomOp::Inc => write!(f, ".inc"), |
| AtomOp::Dec => write!(f, ".dec"), |
| AtomOp::And => write!(f, ".and"), |
| AtomOp::Or => write!(f, ".or"), |
| AtomOp::Xor => write!(f, ".xor"), |
| AtomOp::Exch => write!(f, ".exch"), |
| AtomOp::CmpExch(AtomCmpSrc::Separate) => write!(f, ".cmpexch"), |
| AtomOp::CmpExch(AtomCmpSrc::Packed) => write!(f, ".cmpexch.packed"), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum InterpFreq { |
| Pass, |
| PassMulW, |
| Constant, |
| State, |
| } |
| |
| impl fmt::Display for InterpFreq { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| InterpFreq::Pass => write!(f, ".pass"), |
| InterpFreq::PassMulW => write!(f, ".pass_mul_w"), |
| InterpFreq::Constant => write!(f, ".constant"), |
| InterpFreq::State => write!(f, ".state"), |
| } |
| } |
| } |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum InterpLoc { |
| Default, |
| Centroid, |
| Offset, |
| } |
| |
| impl fmt::Display for InterpLoc { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| InterpLoc::Default => Ok(()), |
| InterpLoc::Centroid => write!(f, ".centroid"), |
| InterpLoc::Offset => write!(f, ".offset"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpFAdd { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| #[src_type(F32)] |
| pub srcs: [Src; 2], |
| |
| pub saturate: bool, |
| pub rnd_mode: FRndMode, |
| pub ftz: bool, |
| } |
| |
| impl DisplayOp for OpFAdd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let sat = if self.saturate { ".sat" } else { "" }; |
| write!(f, "fadd{sat}")?; |
| if self.rnd_mode != FRndMode::NearestEven { |
| write!(f, "{}", self.rnd_mode)?; |
| } |
| if self.ftz { |
| write!(f, ".ftz")?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1],) |
| } |
| } |
| impl_display_for_op!(OpFAdd); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpFFma { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| #[src_type(F32)] |
| pub srcs: [Src; 3], |
| |
| pub saturate: bool, |
| pub rnd_mode: FRndMode, |
| pub ftz: bool, |
| pub dnz: bool, |
| } |
| |
| impl DisplayOp for OpFFma { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let sat = if self.saturate { ".sat" } else { "" }; |
| write!(f, "ffma{sat}")?; |
| if self.rnd_mode != FRndMode::NearestEven { |
| write!(f, "{}", self.rnd_mode)?; |
| } |
| if self.dnz { |
| write!(f, ".dnz")?; |
| } else if self.ftz { |
| write!(f, ".ftz")?; |
| } |
| write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2]) |
| } |
| } |
| impl_display_for_op!(OpFFma); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpFMnMx { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| #[src_type(F32)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(Pred)] |
| pub min: Src, |
| |
| pub ftz: bool, |
| } |
| |
| impl DisplayOp for OpFMnMx { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let ftz = if self.ftz { ".ftz" } else { "" }; |
| write!( |
| f, |
| "fmnmx{ftz} {} {} {}", |
| self.srcs[0], self.srcs[1], self.min |
| ) |
| } |
| } |
| impl_display_for_op!(OpFMnMx); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpFMul { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| #[src_type(F32)] |
| pub srcs: [Src; 2], |
| |
| pub saturate: bool, |
| pub rnd_mode: FRndMode, |
| pub ftz: bool, |
| pub dnz: bool, |
| } |
| |
| impl DisplayOp for OpFMul { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let sat = if self.saturate { ".sat" } else { "" }; |
| write!(f, "fmul{sat}")?; |
| if self.rnd_mode != FRndMode::NearestEven { |
| write!(f, "{}", self.rnd_mode)?; |
| } |
| if self.dnz { |
| write!(f, ".dnz")?; |
| } else if self.ftz { |
| write!(f, ".ftz")?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1],) |
| } |
| } |
| impl_display_for_op!(OpFMul); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpFSet { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| pub cmp_op: FloatCmpOp, |
| |
| #[src_type(F32)] |
| pub srcs: [Src; 2], |
| |
| pub ftz: bool, |
| } |
| |
| impl DisplayOp for OpFSet { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let ftz = if self.ftz { ".ftz" } else { "" }; |
| write!( |
| f, |
| "fset{}{ftz} {} {}", |
| self.cmp_op, self.srcs[0], self.srcs[1] |
| ) |
| } |
| } |
| impl_display_for_op!(OpFSet); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpFSetP { |
| #[dst_type(Pred)] |
| pub dst: Dst, |
| |
| pub set_op: PredSetOp, |
| pub cmp_op: FloatCmpOp, |
| |
| #[src_type(F32)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(Pred)] |
| pub accum: Src, |
| |
| pub ftz: bool, |
| } |
| |
| impl DisplayOp for OpFSetP { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let ftz = if self.ftz { ".ftz" } else { "" }; |
| write!(f, "fsetp{}{ftz}", self.cmp_op)?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, "{}", self.set_op)?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1])?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, " {}", self.accum)?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpFSetP); |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum FSwzAddOp { |
| Add, |
| SubRight, |
| SubLeft, |
| MoveLeft, |
| } |
| |
| impl fmt::Display for FSwzAddOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| FSwzAddOp::Add => write!(f, "add"), |
| FSwzAddOp::SubRight => write!(f, "subr"), |
| FSwzAddOp::SubLeft => write!(f, "sub"), |
| FSwzAddOp::MoveLeft => write!(f, "mov2"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpFSwzAdd { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub srcs: [Src; 2], |
| |
| pub rnd_mode: FRndMode, |
| pub ftz: bool, |
| pub deriv_mode: TexDerivMode, |
| |
| pub ops: [FSwzAddOp; 4], |
| } |
| |
| impl DisplayOp for OpFSwzAdd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "fswzadd",)?; |
| if self.rnd_mode != FRndMode::NearestEven { |
| write!(f, "{}", self.rnd_mode)?; |
| } |
| if self.ftz { |
| write!(f, ".ftz")?; |
| } |
| write!(f, "{}", self.deriv_mode)?; |
| write!( |
| f, |
| " {} {} [{}, {}, {}, {}]", |
| self.srcs[0], |
| self.srcs[1], |
| self.ops[0], |
| self.ops[1], |
| self.ops[2], |
| self.ops[3], |
| ) |
| } |
| } |
| impl_display_for_op!(OpFSwzAdd); |
| |
/// Describes where the second src is taken from before the ops are applied
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum FSwzShuffle { |
| Quad0, |
| Quad1, |
| Quad2, |
| Quad3, |
| // swap [0, 1] and [2, 3] |
| SwapHorizontal, |
| // swap [0, 2] and [1, 3] |
| SwapVertical, |
| } |
| |
| impl fmt::Display for FSwzShuffle { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| FSwzShuffle::Quad0 => write!(f, ".0000"), |
| FSwzShuffle::Quad1 => write!(f, ".1111"), |
| FSwzShuffle::Quad2 => write!(f, ".2222"), |
| FSwzShuffle::Quad3 => write!(f, ".3333"), |
| FSwzShuffle::SwapHorizontal => write!(f, ".1032"), |
| FSwzShuffle::SwapVertical => write!(f, ".2301"), |
| } |
| } |
| } |
| |
/// Op only present on Kepler and older
///
/// It first shuffles the second src and then applies src0 op src1, where
/// each thread in a quad may apply a different operation.
///
/// This is used to encode ddx/ddy. For example, ddx is:
///
///   src1 = shuffle swap horizontal src1
///   ops = [sub, subr, sub, subr]
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpFSwz { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub srcs: [Src; 2], |
| |
| pub rnd_mode: FRndMode, |
| pub ftz: bool, |
| pub deriv_mode: TexDerivMode, |
| pub shuffle: FSwzShuffle, |
| |
| pub ops: [FSwzAddOp; 4], |
| } |
| |
| impl DisplayOp for OpFSwz { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "fswz{}", self.shuffle)?; |
| if self.rnd_mode != FRndMode::NearestEven { |
| write!(f, "{}", self.rnd_mode)?; |
| } |
| write!(f, "{}", self.deriv_mode)?; |
| if self.ftz { |
| write!(f, ".ftz")?; |
| } |
| write!( |
| f, |
| " {} {} [{}, {}, {}, {}]", |
| self.srcs[0], |
| self.srcs[1], |
| self.ops[0], |
| self.ops[1], |
| self.ops[2], |
| self.ops[3], |
| ) |
| } |
| } |
| impl_display_for_op!(OpFSwz); |
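
// Illustrative sketch (not part of the IR): expanding the ddx example from
// the doc comment above, with "sub" and "subr" mapped to their enum variants
// per the FSwzAddOp Display impl:
//
//     let shuffle = FSwzShuffle::SwapHorizontal; // src1 lanes become [1, 0, 3, 2]
//     let ops = [
//         FSwzAddOp::SubLeft,  // "sub"
//         FSwzAddOp::SubRight, // "subr"
//         FSwzAddOp::SubLeft,
//         FSwzAddOp::SubRight,
//     ];
//
// With src0 == src1, both lanes of a horizontal pixel pair then compute the
// same signed difference, which is the per-quad ddx.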
| |
| pub enum RroOp { |
| SinCos, |
| Exp2, |
| } |
| |
| impl fmt::Display for RroOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| RroOp::SinCos => write!(f, ".sincos"), |
| RroOp::Exp2 => write!(f, ".exp2"), |
| } |
| } |
| } |
| |
| /// MuFu range reduction operator |
| /// |
| /// Not available on SM70+ |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpRro { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| pub op: RroOp, |
| |
| #[src_type(F32)] |
| pub src: Src, |
| } |
| |
| impl DisplayOp for OpRro { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "rro{} {}", self.op, self.src) |
| } |
| } |
| impl_display_for_op!(OpRro); |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum MuFuOp { |
| Cos, |
| Sin, |
| Exp2, |
| Log2, |
| Rcp, |
| Rsq, |
| Rcp64H, |
| Rsq64H, |
| Sqrt, |
| Tanh, |
| } |
| |
| impl fmt::Display for MuFuOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| MuFuOp::Cos => write!(f, "cos"), |
| MuFuOp::Sin => write!(f, "sin"), |
| MuFuOp::Exp2 => write!(f, "exp2"), |
| MuFuOp::Log2 => write!(f, "log2"), |
| MuFuOp::Rcp => write!(f, "rcp"), |
| MuFuOp::Rsq => write!(f, "rsq"), |
| MuFuOp::Rcp64H => write!(f, "rcp64h"), |
| MuFuOp::Rsq64H => write!(f, "rsq64h"), |
| MuFuOp::Sqrt => write!(f, "sqrt"), |
| MuFuOp::Tanh => write!(f, "tanh"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpMuFu { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| pub op: MuFuOp, |
| |
| #[src_type(F32)] |
| pub src: Src, |
| } |
| |
| impl DisplayOp for OpMuFu { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "mufu.{} {}", self.op, self.src) |
| } |
| } |
| impl_display_for_op!(OpMuFu); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpDAdd { |
| #[dst_type(F64)] |
| pub dst: Dst, |
| |
| #[src_type(F64)] |
| pub srcs: [Src; 2], |
| |
| pub rnd_mode: FRndMode, |
| } |
| |
| impl DisplayOp for OpDAdd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "dadd")?; |
| if self.rnd_mode != FRndMode::NearestEven { |
| write!(f, "{}", self.rnd_mode)?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1],) |
| } |
| } |
| impl_display_for_op!(OpDAdd); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpDMul { |
| #[dst_type(F64)] |
| pub dst: Dst, |
| |
| #[src_type(F64)] |
| pub srcs: [Src; 2], |
| |
| pub rnd_mode: FRndMode, |
| } |
| |
| impl DisplayOp for OpDMul { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "dmul")?; |
| if self.rnd_mode != FRndMode::NearestEven { |
| write!(f, "{}", self.rnd_mode)?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1],) |
| } |
| } |
| impl_display_for_op!(OpDMul); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpDFma { |
| #[dst_type(F64)] |
| pub dst: Dst, |
| |
| #[src_type(F64)] |
| pub srcs: [Src; 3], |
| |
| pub rnd_mode: FRndMode, |
| } |
| |
| impl DisplayOp for OpDFma { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "dfma")?; |
| if self.rnd_mode != FRndMode::NearestEven { |
| write!(f, "{}", self.rnd_mode)?; |
| } |
| write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2]) |
| } |
| } |
| impl_display_for_op!(OpDFma); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpDMnMx { |
| #[dst_type(F64)] |
| pub dst: Dst, |
| |
| #[src_type(F64)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(Pred)] |
| pub min: Src, |
| } |
| |
| impl DisplayOp for OpDMnMx { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "dmnmx {} {} {}", self.srcs[0], self.srcs[1], self.min) |
| } |
| } |
| impl_display_for_op!(OpDMnMx); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpDSetP { |
| #[dst_type(Pred)] |
| pub dst: Dst, |
| |
| pub set_op: PredSetOp, |
| pub cmp_op: FloatCmpOp, |
| |
| #[src_type(F64)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(Pred)] |
| pub accum: Src, |
| } |
| |
| impl Foldable for OpDSetP { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let a = f.get_f64_src(self, &self.srcs[0]); |
| let b = f.get_f64_src(self, &self.srcs[1]); |
| let accum = f.get_pred_src(self, &self.accum); |
| |
| let ordered = !a.is_nan() && !b.is_nan(); |
| let cmp_res = match self.cmp_op { |
| FloatCmpOp::OrdEq => ordered && a == b, |
| FloatCmpOp::OrdNe => ordered && a != b, |
| FloatCmpOp::OrdLt => ordered && a < b, |
| FloatCmpOp::OrdLe => ordered && a <= b, |
| FloatCmpOp::OrdGt => ordered && a > b, |
| FloatCmpOp::OrdGe => ordered && a >= b, |
| FloatCmpOp::UnordEq => !ordered || a == b, |
| FloatCmpOp::UnordNe => !ordered || a != b, |
| FloatCmpOp::UnordLt => !ordered || a < b, |
| FloatCmpOp::UnordLe => !ordered || a <= b, |
| FloatCmpOp::UnordGt => !ordered || a > b, |
| FloatCmpOp::UnordGe => !ordered || a >= b, |
| FloatCmpOp::IsNum => ordered, |
| FloatCmpOp::IsNan => !ordered, |
| }; |
| let res = self.set_op.eval(cmp_res, accum); |
| |
| f.set_pred_dst(self, &self.dst, res); |
| } |
| } |
| |
| impl DisplayOp for OpDSetP { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "dsetp{}", self.cmp_op)?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, "{}", self.set_op)?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1])?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, " {}", self.accum)?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpDSetP); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpHAdd2 { |
| #[dst_type(F16v2)] |
| pub dst: Dst, |
| |
| #[src_type(F16v2)] |
| pub srcs: [Src; 2], |
| |
| pub saturate: bool, |
| pub ftz: bool, |
| pub f32: bool, |
| } |
| |
| impl DisplayOp for OpHAdd2 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let sat = if self.saturate { ".sat" } else { "" }; |
| let f32 = if self.f32 { ".f32" } else { "" }; |
| write!(f, "hadd2{sat}{f32}")?; |
| if self.ftz { |
| write!(f, ".ftz")?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1]) |
| } |
| } |
| impl_display_for_op!(OpHAdd2); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpHSet2 { |
| #[dst_type(F16v2)] |
| pub dst: Dst, |
| |
| pub set_op: PredSetOp, |
| pub cmp_op: FloatCmpOp, |
| |
| #[src_type(F16v2)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(Pred)] |
| pub accum: Src, |
| |
| pub ftz: bool, |
| } |
| |
| impl DisplayOp for OpHSet2 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let ftz = if self.ftz { ".ftz" } else { "" }; |
| write!(f, "hset2{}{ftz}", self.cmp_op)?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, "{}", self.set_op)?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1])?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, " {}", self.accum)?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpHSet2); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpHSetP2 { |
| #[dst_type(Pred)] |
| pub dsts: [Dst; 2], |
| |
| pub set_op: PredSetOp, |
| pub cmp_op: FloatCmpOp, |
| |
| #[src_type(F16v2)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(Pred)] |
| pub accum: Src, |
| |
| pub ftz: bool, |
| |
    // When not set, each dst gets the result of its own lane.
    // When set, the first dst gets the result of both lanes (res0 && res1)
    // and the second dst gets the negation !(res0 && res1),
    // both before the accumulator is applied.
| pub horizontal: bool, |
| } |
| |
| impl DisplayOp for OpHSetP2 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let ftz = if self.ftz { ".ftz" } else { "" }; |
| write!(f, "hsetp2{}{ftz}", self.cmp_op)?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, "{}", self.set_op)?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1])?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, " {}", self.accum)?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpHSetP2); |
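
// Illustrative sketch (not part of the IR) of the horizontal mode: if the
// per-lane comparisons produce res0 = true and res1 = false, then
//
//     horizontal == false => dsts == [res0, res1]        == [true, false]
//     horizontal == true  => dsts == [res0 && res1,
//                                     !(res0 && res1)]   == [false, true]
//
// in both cases before the accumulator is applied.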
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpHMul2 { |
| #[dst_type(F16v2)] |
| pub dst: Dst, |
| |
| #[src_type(F16v2)] |
| pub srcs: [Src; 2], |
| |
| pub saturate: bool, |
| pub ftz: bool, |
| pub dnz: bool, |
| } |
| |
| impl DisplayOp for OpHMul2 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let sat = if self.saturate { ".sat" } else { "" }; |
| write!(f, "hmul2{sat}")?; |
| if self.dnz { |
| write!(f, ".dnz")?; |
| } else if self.ftz { |
| write!(f, ".ftz")?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1]) |
| } |
| } |
| impl_display_for_op!(OpHMul2); |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| #[allow(dead_code)] |
| pub enum ImmaSize { |
| M8N8K16, |
| M8N8K32, |
| M16N8K16, |
| M16N8K32, |
| M16N8K64, |
| } |
| |
| impl fmt::Display for ImmaSize { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| ImmaSize::M8N8K16 => write!(f, ".m8n8k16"), |
| ImmaSize::M8N8K32 => write!(f, ".m8n8k32"), |
| ImmaSize::M16N8K16 => write!(f, ".m16n8k16"), |
| ImmaSize::M16N8K32 => write!(f, ".m16n8k32"), |
| ImmaSize::M16N8K64 => write!(f, ".m16n8k64"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpImma { |
| #[dst_type(Vec)] |
| pub dst: Dst, |
| |
| pub mat_size: ImmaSize, |
| pub src_types: [IntType; 2], |
| pub saturate: bool, |
| |
| #[src_type(SSA)] |
| pub srcs: [Src; 3], |
| } |
| |
| impl DisplayOp for OpImma { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let sat = if self.saturate { ".sat" } else { "" }; |
| write!( |
| f, |
| "imma{}{}{}{sat} {} {} {}", |
| self.mat_size, |
| self.src_types[0], |
| self.src_types[1], |
| self.srcs[0], |
| self.srcs[1], |
| self.srcs[2], |
| ) |
| } |
| } |
| |
| impl_display_for_op!(OpImma); |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| #[allow(dead_code)] |
| pub enum HmmaSize { |
| M16N8K16, |
| M16N8K8, |
| M16N8K4, |
| } |
| |
| impl fmt::Display for HmmaSize { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| HmmaSize::M16N8K16 => write!(f, ".m16n8k16"), |
| HmmaSize::M16N8K8 => write!(f, ".m16n8k8"), |
| HmmaSize::M16N8K4 => write!(f, ".m16n8k4"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpHmma { |
| #[dst_type(Vec)] |
| pub dst: Dst, |
| |
| pub mat_size: HmmaSize, |
| pub src_type: FloatType, |
| pub dst_type: FloatType, |
| |
| #[src_type(SSA)] |
| pub srcs: [Src; 3], |
| } |
| |
| impl DisplayOp for OpHmma { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "hmma{}{} {} {} {}", |
| self.mat_size, |
| self.dst_type, |
| self.srcs[0], |
| self.srcs[1], |
| self.srcs[2], |
| ) |
| } |
| } |
| |
| impl_display_for_op!(OpHmma); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpHFma2 { |
| #[dst_type(F16v2)] |
| pub dst: Dst, |
| |
| #[src_type(F16v2)] |
| pub srcs: [Src; 3], |
| |
| pub saturate: bool, |
| pub ftz: bool, |
| pub dnz: bool, |
| pub f32: bool, |
| } |
| |
| impl DisplayOp for OpHFma2 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let sat = if self.saturate { ".sat" } else { "" }; |
| let f32 = if self.f32 { ".f32" } else { "" }; |
| write!(f, "hfma2{sat}{f32}")?; |
| if self.dnz { |
| write!(f, ".dnz")?; |
| } else if self.ftz { |
| write!(f, ".ftz")?; |
| } |
| write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2]) |
| } |
| } |
| impl_display_for_op!(OpHFma2); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpHMnMx2 { |
| #[dst_type(F16v2)] |
| pub dst: Dst, |
| |
| #[src_type(F16v2)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(Pred)] |
| pub min: Src, |
| |
| pub ftz: bool, |
| } |
| |
| impl DisplayOp for OpHMnMx2 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let ftz = if self.ftz { ".ftz" } else { "" }; |
| write!( |
| f, |
| "hmnmx2{ftz} {} {} {}", |
| self.srcs[0], self.srcs[1], self.min |
| ) |
| } |
| } |
| impl_display_for_op!(OpHMnMx2); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBMsk { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub pos: Src, |
| |
| #[src_type(ALU)] |
| pub width: Src, |
| |
| pub wrap: bool, |
| } |
| |
| impl DisplayOp for OpBMsk { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let wrap = if self.wrap { ".wrap" } else { ".clamp" }; |
| write!(f, "bmsk{} {} {}", wrap, self.pos, self.width) |
| } |
| } |
| impl_display_for_op!(OpBMsk); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBRev { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub src: Src, |
| } |
| |
| impl DisplayOp for OpBRev { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "brev {}", self.src) |
| } |
| } |
| impl_display_for_op!(OpBRev); |
| |
/// Bitfield extract. Extracts the range of bits described by `range` from
/// `base` into `dst`.
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBfe { |
    /// Where to place the extracted bits.
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| /// The source of bits to extract. |
| #[src_type(ALU)] |
| pub base: Src, |
| |
| /// The range of bits to extract. This source is interpreted as four |
| /// separate bytes, [b0, b1, b2, b3]. |
| /// |
| /// b0 and b1: unused |
| /// b2: the number of bits to extract. |
| /// b3: the offset of the first bit to extract. |
| /// |
| /// This matches the way the hardware works. |
| #[src_type(ALU)] |
| pub range: Src, |
| |
| /// Whether the output is signed |
| pub signed: bool, |
| |
| /// Whether to reverse the bits before inserting them into `dst`. |
| pub reverse: bool, |
| } |
| |
| impl DisplayOp for OpBfe { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "bfe")?; |
| if self.signed { |
| write!(f, ".s")?; |
| } |
| if self.reverse { |
| write!(f, ".rev")?; |
| } |
| write!(f, " {} {}", self.base, self.range,) |
| } |
| } |
| impl_display_for_op!(OpBfe); |
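
// Illustrative sketch (not part of the IR): packing the extraction range as
// an immediate, following the byte layout described above (b2 = bit count,
// b3 = starting offset) and assuming b0 is the least-significant byte:
//
//     let offset: u32 = 3; // start extracting at bit 3
//     let count: u32 = 5;  // extract 5 bits
//     let range_imm = (count << 16) | (offset << 24);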
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpFlo { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub src: Src, |
| |
| pub signed: bool, |
| pub return_shift_amount: bool, |
| } |
| |
| impl Foldable for OpFlo { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let src = f.get_u32_src(self, &self.src); |
| let leading = if self.signed && (src & 0x80000000) != 0 { |
| (!src).leading_zeros() |
| } else { |
| src.leading_zeros() |
| }; |
| let dst = if self.return_shift_amount { |
| leading |
| } else { |
| 31 - leading |
| }; |
| f.set_u32_dst(self, &self.dst, dst); |
| } |
| } |
| |
| impl DisplayOp for OpFlo { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "flo")?; |
| if self.return_shift_amount { |
| write!(f, ".samt")?; |
| } |
| write!(f, " {}", self.src) |
| } |
| } |
| impl_display_for_op!(OpFlo); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIAbs { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub src: Src, |
| } |
| |
| impl Foldable for OpIAbs { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let src = f.get_u32_src(self, &self.src); |
| let dst = (src as i32).unsigned_abs(); |
| f.set_u32_dst(self, &self.dst, dst); |
| } |
| } |
| |
| impl DisplayOp for OpIAbs { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "iabs {}", self.src) |
| } |
| } |
| impl_display_for_op!(OpIAbs); |
| |
| /// Only used on SM50 |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIAdd2 { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| #[dst_type(Carry)] |
| pub carry_out: Dst, |
| |
| #[src_type(I32)] |
| pub srcs: [Src; 2], |
| } |
| |
| impl Foldable for OpIAdd2 { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let srcs = [ |
| f.get_u32_src(self, &self.srcs[0]), |
| f.get_u32_src(self, &self.srcs[1]), |
| ]; |
| |
| let mut sum = 0_u64; |
| for i in 0..2 { |
| if self.srcs[i].src_mod.is_ineg() { |
                // This is a very literal interpretation of two's complement.
| // This is not -u64::from(src) or u64::from(-src). |
| sum += u64::from(!srcs[i]) + 1; |
| } else { |
| sum += u64::from(srcs[i]); |
| } |
| } |
| |
| f.set_u32_dst(self, &self.dst, sum as u32); |
| f.set_carry_dst(self, &self.carry_out, sum >= (1 << 32)); |
| } |
| } |
| |
| impl DisplayOp for OpIAdd2 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "iadd2 {} {}", self.srcs[0], self.srcs[1]) |
| } |
| } |
| |
| /// Only used on SM50 |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIAdd2X { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| #[dst_type(Carry)] |
| pub carry_out: Dst, |
| |
| #[src_type(B32)] |
| pub srcs: [Src; 2], |
| #[src_type(Carry)] |
| pub carry_in: Src, |
| } |
| |
| impl Foldable for OpIAdd2X { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let srcs = [ |
| f.get_u32_bnot_src(self, &self.srcs[0]), |
| f.get_u32_bnot_src(self, &self.srcs[1]), |
| ]; |
| let carry_in = f.get_carry_src(self, &self.carry_in); |
| |
| let sum = u64::from(srcs[0]) + u64::from(srcs[1]) + u64::from(carry_in); |
| |
| f.set_u32_dst(self, &self.dst, sum as u32); |
| f.set_carry_dst(self, &self.carry_out, sum >= (1 << 32)); |
| } |
| } |
| |
| impl DisplayOp for OpIAdd2X { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "iadd2.x {} {}", self.srcs[0], self.srcs[1])?; |
| if !self.carry_in.is_zero() { |
| write!(f, " {}", self.carry_in)?; |
| } |
| Ok(()) |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIAdd3 { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[dst_type(Pred)] |
| pub overflow: [Dst; 2], |
| |
| #[src_type(I32)] |
| pub srcs: [Src; 3], |
| } |
| |
| impl Foldable for OpIAdd3 { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let srcs = [ |
| f.get_u32_src(self, &self.srcs[0]), |
| f.get_u32_src(self, &self.srcs[1]), |
| f.get_u32_src(self, &self.srcs[2]), |
| ]; |
| |
| let mut sum = 0_u64; |
| for i in 0..3 { |
| if self.srcs[i].src_mod.is_ineg() { |
                // This is a very literal interpretation of two's complement.
| // This is not -u64::from(src) or u64::from(-src). |
| sum += u64::from(!srcs[i]) + 1; |
| } else { |
| sum += u64::from(srcs[i]); |
| } |
| } |
| |
| f.set_u32_dst(self, &self.dst, sum as u32); |
| f.set_pred_dst(self, &self.overflow[0], sum >= 1_u64 << 32); |
| f.set_pred_dst(self, &self.overflow[1], sum >= 2_u64 << 32); |
| } |
| } |
| |
| impl DisplayOp for OpIAdd3 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "iadd3 {} {} {}", |
| self.srcs[0], self.srcs[1], self.srcs[2], |
| ) |
| } |
| } |
| impl_display_for_op!(OpIAdd3); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIAdd3X { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[dst_type(Pred)] |
| pub overflow: [Dst; 2], |
| |
| #[src_type(B32)] |
| pub srcs: [Src; 3], |
| |
| #[src_type(Pred)] |
| pub carry: [Src; 2], |
| } |
| |
| impl Foldable for OpIAdd3X { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let srcs = [ |
| f.get_u32_bnot_src(self, &self.srcs[0]), |
| f.get_u32_bnot_src(self, &self.srcs[1]), |
| f.get_u32_bnot_src(self, &self.srcs[2]), |
| ]; |
| let carry = [ |
| f.get_pred_src(self, &self.carry[0]), |
| f.get_pred_src(self, &self.carry[1]), |
| ]; |
| |
| let mut sum = 0_u64; |
| for i in 0..3 { |
| sum += u64::from(srcs[i]); |
| } |
| |
| for i in 0..2 { |
| sum += u64::from(carry[i]); |
| } |
| |
| f.set_u32_dst(self, &self.dst, sum as u32); |
| f.set_pred_dst(self, &self.overflow[0], sum >= 1_u64 << 32); |
| f.set_pred_dst(self, &self.overflow[1], sum >= 2_u64 << 32); |
| } |
| } |
| |
| impl DisplayOp for OpIAdd3X { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "iadd3.x {} {} {} {} {}", |
| self.srcs[0], |
| self.srcs[1], |
| self.srcs[2], |
| self.carry[0], |
| self.carry[1] |
| ) |
| } |
| } |
| impl_display_for_op!(OpIAdd3X); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIDp4 { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| pub src_types: [IntType; 2], |
| |
| #[src_type(I32)] |
| pub srcs: [Src; 3], |
| } |
| |
| impl DisplayOp for OpIDp4 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "idp4{}{} {} {} {}", |
| self.src_types[0], |
| self.src_types[1], |
| self.srcs[0], |
| self.srcs[1], |
| self.srcs[2], |
| ) |
| } |
| } |
| impl_display_for_op!(OpIDp4); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIMad { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 3], |
| |
| pub signed: bool, |
| } |
| |
| impl DisplayOp for OpIMad { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "imad {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2],) |
| } |
| } |
| impl_display_for_op!(OpIMad); |
| |
| /// Only used on SM50 |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIMul { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 2], |
| |
| pub signed: [bool; 2], |
| pub high: bool, |
| } |
| |
| impl DisplayOp for OpIMul { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "imul")?; |
| if self.high { |
| write!(f, ".hi")?; |
| } |
| let src_type = |signed| if signed { ".s32" } else { ".u32" }; |
| write!( |
| f, |
| "{}{}", |
| src_type(self.signed[0]), |
| src_type(self.signed[1]) |
| )?; |
| write!(f, " {} {}", self.srcs[0], self.srcs[1]) |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIMad64 { |
| #[dst_type(Vec)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 3], |
| |
| pub signed: bool, |
| } |
| |
| impl DisplayOp for OpIMad64 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "imad64 {} {} {}", |
| self.srcs[0], self.srcs[1], self.srcs[2], |
| ) |
| } |
| } |
| impl_display_for_op!(OpIMad64); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIMnMx { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| pub cmp_type: IntCmpType, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(Pred)] |
| pub min: Src, |
| } |
| |
| impl Foldable for OpIMnMx { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let (a, b) = ( |
| f.get_u32_bnot_src(self, &self.srcs[0]), |
| f.get_u32_bnot_src(self, &self.srcs[1]), |
| ); |
| let min = f.get_pred_src(self, &self.min); |
| |
| let res = match (min, self.cmp_type) { |
| (true, IntCmpType::U32) => a.min(b), |
| (true, IntCmpType::I32) => (a as i32).min(b as i32) as u32, |
| (false, IntCmpType::U32) => a.max(b), |
| (false, IntCmpType::I32) => (a as i32).max(b as i32) as u32, |
| }; |
| |
| f.set_u32_dst(self, &self.dst, res); |
| } |
| } |
| |
| impl DisplayOp for OpIMnMx { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "imnmx{} {} {} {}", |
| self.cmp_type, self.srcs[0], self.srcs[1], self.min |
| ) |
| } |
| } |
| impl_display_for_op!(OpIMnMx); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpISetP { |
| #[dst_type(Pred)] |
| pub dst: Dst, |
| |
| pub set_op: PredSetOp, |
| pub cmp_op: IntCmpOp, |
| pub cmp_type: IntCmpType, |
| pub ex: bool, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(Pred)] |
| pub accum: Src, |
| |
| #[src_type(Pred)] |
| pub low_cmp: Src, |
| } |
| |
| impl Foldable for OpISetP { |
| fn fold(&self, sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let x = f.get_u32_src(self, &self.srcs[0]); |
| let y = f.get_u32_src(self, &self.srcs[1]); |
| let accum = f.get_pred_src(self, &self.accum); |
| let low_cmp = f.get_pred_src(self, &self.low_cmp); |
| |
| let cmp = if self.cmp_type.is_signed() { |
| let x = x as i32; |
| let y = y as i32; |
| match &self.cmp_op { |
| IntCmpOp::False => false, |
| IntCmpOp::True => true, |
| IntCmpOp::Eq => x == y, |
| IntCmpOp::Ne => x != y, |
| IntCmpOp::Lt => x < y, |
| IntCmpOp::Le => x <= y, |
| IntCmpOp::Gt => x > y, |
| IntCmpOp::Ge => x >= y, |
| } |
| } else { |
| match &self.cmp_op { |
| IntCmpOp::False => false, |
| IntCmpOp::True => true, |
| IntCmpOp::Eq => x == y, |
| IntCmpOp::Ne => x != y, |
| IntCmpOp::Lt => x < y, |
| IntCmpOp::Le => x <= y, |
| IntCmpOp::Gt => x > y, |
| IntCmpOp::Ge => x >= y, |
| } |
| }; |
| |
| let cmp_op_is_const = |
| matches!(self.cmp_op, IntCmpOp::False | IntCmpOp::True); |
| let cmp = if self.ex && x == y && !cmp_op_is_const { |
            // Pre-Volta, isetp.x takes the accumulator into account. If we
            // want to support this, we need to take the accumulator into
            // account here as well. Disallow it for now.
| assert!(sm.sm() >= 70); |
| low_cmp |
| } else { |
| cmp |
| }; |
| |
| let dst = self.set_op.eval(cmp, accum); |
| |
| f.set_pred_dst(self, &self.dst, dst); |
| } |
| } |
| |
| impl DisplayOp for OpISetP { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "isetp{}{}", self.cmp_op, self.cmp_type)?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, "{}", self.set_op)?; |
| } |
| if self.ex { |
| write!(f, ".ex")?; |
| } |
| write!(f, " {} {}", self.srcs[0], self.srcs[1])?; |
| if !self.set_op.is_trivial(&self.accum) { |
| write!(f, " {}", self.accum)?; |
| } |
| if self.ex { |
| write!(f, " {}", self.low_cmp)?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpISetP); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpLea { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[dst_type(Pred)] |
| pub overflow: Dst, |
| |
| #[src_type(ALU)] |
| pub a: Src, |
| |
| #[src_type(I32)] |
| pub b: Src, |
| |
| #[src_type(ALU)] |
| pub a_high: Src, // High 32-bits of a if .dst_high is set |
| |
| pub shift: u8, |
| pub dst_high: bool, |
| pub intermediate_mod: SrcMod, // Modifier for shifted temporary (a << shift) |
| } |
| |
| impl Foldable for OpLea { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let a = f.get_u32_src(self, &self.a); |
| let mut b = f.get_u32_src(self, &self.b); |
| let a_high = f.get_u32_src(self, &self.a_high); |
| |
| let mut overflow = false; |
| |
| let mut shift_result = if self.dst_high { |
| let a = a as u64; |
| let a_high = a_high as u64; |
| let a = (a_high << 32) | a; |
| |
| (a >> (32 - self.shift)) as u32 |
| } else { |
| a << self.shift |
| }; |
| |
| if self.intermediate_mod.is_ineg() { |
| let o; |
| (shift_result, o) = u32::overflowing_add(!shift_result, 1); |
| overflow |= o; |
| } |
| |
| if self.b.src_mod.is_ineg() { |
| let o; |
| (b, o) = u32::overflowing_add(!b, 1); |
| overflow |= o; |
| } |
| |
| let (dst, o) = u32::overflowing_add(shift_result, b); |
| overflow |= o; |
| |
| f.set_u32_dst(self, &self.dst, dst); |
| f.set_pred_dst(self, &self.overflow, overflow); |
| } |
| } |
| |
| impl DisplayOp for OpLea { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "lea")?; |
| if self.dst_high { |
| write!(f, ".hi")?; |
| } |
| write!(f, " {} {} {}", self.a, self.shift, self.b)?; |
| if self.dst_high { |
| write!(f, " {}", self.a_high)?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpLea); |
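
// Illustrative sketch (not part of the IR): ignoring source modifiers and the
// .hi path, the fold above computes dst = (a << shift) + b. For example,
// indexing an array of 16-byte elements is b + (a << 4), with any overflow
// reported in the predicate dst.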
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpLeaX { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[dst_type(Pred)] |
| pub overflow: Dst, |
| |
| #[src_type(ALU)] |
| pub a: Src, |
| |
| #[src_type(B32)] |
| pub b: Src, |
| |
| #[src_type(ALU)] |
| pub a_high: Src, // High 32-bits of a if .dst_high is set |
| |
| #[src_type(Pred)] |
| pub carry: Src, |
| |
| pub shift: u8, |
| pub dst_high: bool, |
| pub intermediate_mod: SrcMod, // Modifier for shifted temporary (a << shift) |
| } |
| |
| impl Foldable for OpLeaX { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let a = f.get_u32_src(self, &self.a); |
| let mut b = f.get_u32_src(self, &self.b); |
| let a_high = f.get_u32_src(self, &self.a_high); |
| let carry = f.get_pred_src(self, &self.carry); |
| |
| let mut overflow = false; |
| |
| let mut shift_result = if self.dst_high { |
| let a = a as u64; |
| let a_high = a_high as u64; |
| let a = (a_high << 32) | a; |
| |
| (a >> (32 - self.shift)) as u32 |
| } else { |
| a << self.shift |
| }; |
| |
| if self.intermediate_mod.is_bnot() { |
| shift_result = !shift_result; |
| } |
| |
| if self.b.src_mod.is_bnot() { |
| b = !b; |
| } |
| |
| let (dst, o) = u32::overflowing_add(shift_result, b); |
| overflow |= o; |
| |
| let (dst, o) = u32::overflowing_add(dst, if carry { 1 } else { 0 }); |
| overflow |= o; |
| |
| f.set_u32_dst(self, &self.dst, dst); |
| f.set_pred_dst(self, &self.overflow, overflow); |
| } |
| } |
| |
| impl DisplayOp for OpLeaX { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "lea.x")?; |
| if self.dst_high { |
| write!(f, ".hi")?; |
| } |
| write!(f, " {} {} {}", self.a, self.shift, self.b)?; |
| if self.dst_high { |
| write!(f, " {}", self.a_high)?; |
| } |
| write!(f, " {}", self.carry)?; |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpLeaX); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpLop2 { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(B32)] |
| pub srcs: [Src; 2], |
| |
| pub op: LogicOp2, |
| } |
| |
| impl DisplayOp for OpLop2 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "lop2.{} {} {}", self.op, self.srcs[0], self.srcs[1],) |
| } |
| } |
| |
| impl Foldable for OpLop2 { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let srcs = [ |
| f.get_u32_bnot_src(self, &self.srcs[0]), |
| f.get_u32_bnot_src(self, &self.srcs[1]), |
| ]; |
| let dst = match self.op { |
| LogicOp2::And => srcs[0] & srcs[1], |
| LogicOp2::Or => srcs[0] | srcs[1], |
| LogicOp2::Xor => srcs[0] ^ srcs[1], |
| LogicOp2::PassB => srcs[1], |
| }; |
| f.set_u32_dst(self, &self.dst, dst); |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpLop3 { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 3], |
| |
| pub op: LogicOp3, |
| } |
| |
| impl Foldable for OpLop3 { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let srcs = [ |
| f.get_u32_bnot_src(self, &self.srcs[0]), |
| f.get_u32_bnot_src(self, &self.srcs[1]), |
| f.get_u32_bnot_src(self, &self.srcs[2]), |
| ]; |
| let dst = self.op.eval(srcs[0], srcs[1], srcs[2]); |
| f.set_u32_dst(self, &self.dst, dst); |
| } |
| } |
| |
| impl DisplayOp for OpLop3 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "lop3.{} {} {} {}", |
| self.op, self.srcs[0], self.srcs[1], self.srcs[2], |
| ) |
| } |
| } |
| impl_display_for_op!(OpLop3); |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum ShflOp { |
| Idx, |
| Up, |
| Down, |
| Bfly, |
| } |
| |
| impl fmt::Display for ShflOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| ShflOp::Idx => write!(f, "idx"), |
| ShflOp::Up => write!(f, "up"), |
| ShflOp::Down => write!(f, "down"), |
| ShflOp::Bfly => write!(f, "bfly"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpShf { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub low: Src, |
| |
| #[src_type(ALU)] |
| pub high: Src, |
| |
| #[src_type(ALU)] |
| pub shift: Src, |
| |
| pub right: bool, |
| pub wrap: bool, |
| pub data_type: IntType, |
| pub dst_high: bool, |
| } |
| |
| fn reduce_shift_imm(shift: &mut Src, wrap: bool, bits: u32) { |
| debug_assert!(shift.src_mod.is_none()); |
| if let SrcRef::Imm32(shift) = &mut shift.src_ref { |
| if wrap { |
| *shift = *shift & (bits - 1); |
| } else { |
| *shift = std::cmp::min(*shift, bits) |
| } |
| } |
| } |
| |
| impl OpShf { |
| /// Reduces the shift immediate, if any. Out-of-range shifts are either |
| /// clamped to the maximum or wrapped as needed. |
| pub fn reduce_shift_imm(&mut self) { |
| let bits = self.data_type.bits().try_into().unwrap(); |
| reduce_shift_imm(&mut self.shift, self.wrap, bits); |
| } |
| } |
| |
| impl Foldable for OpShf { |
| fn fold(&self, sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let low = f.get_u32_src(self, &self.low); |
| let high = f.get_u32_src(self, &self.high); |
| let shift = f.get_u32_src(self, &self.shift); |
| |
| let bits: u32 = self.data_type.bits().try_into().unwrap(); |
| let shift = if self.wrap { |
| shift & (bits - 1) |
| } else { |
| min(shift, bits) |
| }; |
| |
| let x = u64::from(low) | (u64::from(high) << 32); |
| let shifted = if sm.sm() < 70 |
| && self.dst_high |
| && self.data_type != IntType::I64 |
| { |
| if self.right { |
| x.checked_shr(shift).unwrap_or(0) |
| } else { |
| x.checked_shl(shift).unwrap_or(0) |
| } |
| } else if self.data_type.is_signed() { |
| if self.right { |
| let x = x as i64; |
| x.checked_shr(shift).unwrap_or(x >> 63) as u64 |
| } else { |
| x.checked_shl(shift).unwrap_or(0) |
| } |
| } else { |
| if self.right { |
| x.checked_shr(shift).unwrap_or(0) |
| } else { |
| x.checked_shl(shift).unwrap_or(0) |
| } |
| }; |
| |
| let dst = if (sm.sm() < 70 && !self.right) || self.dst_high { |
| (shifted >> 32) as u32 |
| } else { |
| shifted as u32 |
| }; |
| |
| f.set_u32_dst(self, &self.dst, dst); |
| } |
| } |
| |
| impl DisplayOp for OpShf { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "shf")?; |
| if self.right { |
| write!(f, ".r")?; |
| } else { |
| write!(f, ".l")?; |
| } |
| if self.wrap { |
| write!(f, ".w")?; |
| } |
| write!(f, "{}", self.data_type)?; |
| if self.dst_high { |
| write!(f, ".hi")?; |
| } |
| write!(f, " {} {} {}", self.low, self.high, self.shift) |
| } |
| } |
| impl_display_for_op!(OpShf); |
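
// Illustrative sketch (not part of the IR): on SM70+ with dst_high = false, a
// right funnel shift treats {high, low} as one 64-bit value and returns the
// low 32 bits of the shifted result:
//
//     low = 0xDDCCBBAA, high = 0x44332211, shift = 8, right = true
//     x   = 0x44332211_DDCCBBAA
//     dst = (x >> 8) as u32 == 0x11DDCCBB
//
// This is how 64-bit shifts are assembled from 32-bit registers.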
| |
| /// Only used on SM50 |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpShl { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub src: Src, |
| |
| #[src_type(ALU)] |
| pub shift: Src, |
| |
| pub wrap: bool, |
| } |
| |
| impl OpShl { |
| /// Reduces the shift immediate, if any. Out-of-range shifts are either |
| /// clamped to the maximum or wrapped as needed. |
| pub fn reduce_shift_imm(&mut self) { |
| reduce_shift_imm(&mut self.shift, self.wrap, 32); |
| } |
| } |
| |
| impl DisplayOp for OpShl { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "shl")?; |
| if self.wrap { |
| write!(f, ".w")?; |
| } |
| write!(f, " {} {}", self.src, self.shift) |
| } |
| } |
| |
| impl Foldable for OpShl { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let x = f.get_u32_src(self, &self.src); |
| let shift = f.get_u32_src(self, &self.shift); |
| |
| let shift = if self.wrap { |
| shift & 31 |
| } else { |
| min(shift, 32) |
| }; |
| let dst = x.checked_shl(shift).unwrap_or(0); |
| f.set_u32_dst(self, &self.dst, dst); |
| } |
| } |
| |
| /// Only used on SM50 |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpShr { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub src: Src, |
| |
| #[src_type(ALU)] |
| pub shift: Src, |
| |
| pub wrap: bool, |
| pub signed: bool, |
| } |
| |
| impl DisplayOp for OpShr { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "shr")?; |
| if self.wrap { |
| write!(f, ".w")?; |
| } |
| if !self.signed { |
| write!(f, ".u32")?; |
| } |
| write!(f, " {} {}", self.src, self.shift) |
| } |
| } |
| |
| impl OpShr { |
| /// Reduces the shift immediate, if any. Out-of-range shifts are either |
| /// clamped to the maximum or wrapped as needed. |
| pub fn reduce_shift_imm(&mut self) { |
| reduce_shift_imm(&mut self.shift, self.wrap, 32); |
| } |
| } |
| |
| impl Foldable for OpShr { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let x = f.get_u32_src(self, &self.src); |
| let shift = f.get_u32_src(self, &self.shift); |
| |
| let shift = if self.wrap { |
| shift & 31 |
| } else { |
| min(shift, 32) |
| }; |
| let dst = if self.signed { |
| let x = x as i32; |
| x.checked_shr(shift).unwrap_or(x >> 31) as u32 |
| } else { |
| x.checked_shr(shift).unwrap_or(0) |
| }; |
| f.set_u32_dst(self, &self.dst, dst); |
| } |
| } |
| |
| #[repr(C)] |
| pub struct OpF2F { |
| pub dst: Dst, |
| pub src: Src, |
| |
| pub src_type: FloatType, |
| pub dst_type: FloatType, |
| pub rnd_mode: FRndMode, |
| pub ftz: bool, |
| /// For 16-bit up-conversions, take the high 16 bits of the source register. |
| /// For 16-bit down-conversions, place the result into the upper 16 bits of |
| /// the destination register |
| pub high: bool, |
| /// Round to the nearest integer rather than nearest float |
| /// |
| /// Not available on SM70+ |
| pub integer_rnd: bool, |
| } |
| |
| impl AsSlice<Src> for OpF2F { |
| type Attr = SrcType; |
| |
| fn as_slice(&self) -> &[Src] { |
| std::slice::from_ref(&self.src) |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Src] { |
| std::slice::from_mut(&mut self.src) |
| } |
| |
| fn attrs(&self) -> SrcTypeList { |
| let src_type = match self.src_type { |
| FloatType::F16 => SrcType::F16, |
| FloatType::F32 => SrcType::F32, |
| FloatType::F64 => SrcType::F64, |
| }; |
| SrcTypeList::Uniform(src_type) |
| } |
| } |
| |
| impl AsSlice<Dst> for OpF2F { |
| type Attr = DstType; |
| |
| fn as_slice(&self) -> &[Dst] { |
| std::slice::from_ref(&self.dst) |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Dst] { |
| std::slice::from_mut(&mut self.dst) |
| } |
| |
| fn attrs(&self) -> DstTypeList { |
| let dst_type = match self.dst_type { |
| FloatType::F16 => DstType::F16, |
| FloatType::F32 => DstType::F32, |
| FloatType::F64 => DstType::F64, |
| }; |
| DstTypeList::Uniform(dst_type) |
| } |
| } |
| |
| impl DisplayOp for OpF2F { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "f2f")?; |
| if self.ftz { |
| write!(f, ".ftz")?; |
| } |
| if self.integer_rnd { |
| write!(f, ".int")?; |
| } |
| write!( |
| f, |
| "{}{}{} {}", |
| self.dst_type, self.src_type, self.rnd_mode, self.src, |
| ) |
| } |
| } |
| impl_display_for_op!(OpF2F); |
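
// Illustrative sketch (not part of the IR): with dst_type = F16, src_type =
// F32, and high = true, the converted f16 lands in bits [16..32) of the
// destination register; one plausible use is pairing two such conversions to
// build a packed f16x2 value.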
| |
| #[repr(C)] |
| #[derive(DstsAsSlice, SrcsAsSlice)] |
| pub struct OpF2FP { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 2], |
| |
| pub rnd_mode: FRndMode, |
| } |
| |
| impl DisplayOp for OpF2FP { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "f2fp.pack_ab")?; |
| if self.rnd_mode != FRndMode::NearestEven { |
| write!(f, "{}", self.rnd_mode)?; |
| } |
| write!(f, " {}, {}", self.srcs[0], self.srcs[1],) |
| } |
| } |
| impl_display_for_op!(OpF2FP); |
| |
| #[repr(C)] |
| #[derive(DstsAsSlice)] |
| pub struct OpF2I { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| pub src: Src, |
| |
| pub src_type: FloatType, |
| pub dst_type: IntType, |
| pub rnd_mode: FRndMode, |
| pub ftz: bool, |
| } |
| |
| impl AsSlice<Src> for OpF2I { |
| type Attr = SrcType; |
| |
| fn as_slice(&self) -> &[Src] { |
| std::slice::from_ref(&self.src) |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Src] { |
| std::slice::from_mut(&mut self.src) |
| } |
| |
| fn attrs(&self) -> SrcTypeList { |
| let src_type = match self.src_type { |
| FloatType::F16 => SrcType::F16, |
| FloatType::F32 => SrcType::F32, |
| FloatType::F64 => SrcType::F64, |
| }; |
| SrcTypeList::Uniform(src_type) |
| } |
| } |
| |
| impl DisplayOp for OpF2I { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let ftz = if self.ftz { ".ftz" } else { "" }; |
| write!( |
| f, |
| "f2i{}{}{}{ftz} {}", |
| self.dst_type, self.src_type, self.rnd_mode, self.src, |
| ) |
| } |
| } |
| impl_display_for_op!(OpF2I); |
| |
| #[repr(C)] |
| pub struct OpI2F { |
| pub dst: Dst, |
| pub src: Src, |
| |
| pub dst_type: FloatType, |
| pub src_type: IntType, |
| pub rnd_mode: FRndMode, |
| } |
| |
| impl AsSlice<Src> for OpI2F { |
| type Attr = SrcType; |
| |
| fn as_slice(&self) -> &[Src] { |
| std::slice::from_ref(&self.src) |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Src] { |
| std::slice::from_mut(&mut self.src) |
| } |
| |
| fn attrs(&self) -> SrcTypeList { |
| if self.src_type.bits() <= 32 { |
| SrcTypeList::Uniform(SrcType::ALU) |
| } else { |
| SrcTypeList::Uniform(SrcType::GPR) |
| } |
| } |
| } |
| |
| impl AsSlice<Dst> for OpI2F { |
| type Attr = DstType; |
| |
| fn as_slice(&self) -> &[Dst] { |
| std::slice::from_ref(&self.dst) |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Dst] { |
| std::slice::from_mut(&mut self.dst) |
| } |
| |
| fn attrs(&self) -> DstTypeList { |
| let dst_type = match self.dst_type { |
| FloatType::F16 => DstType::F16, |
| FloatType::F32 => DstType::F32, |
| FloatType::F64 => DstType::F64, |
| }; |
| DstTypeList::Uniform(dst_type) |
| } |
| } |
| |
| impl DisplayOp for OpI2F { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "i2f{}{}{} {}", |
| self.dst_type, self.src_type, self.rnd_mode, self.src, |
| ) |
| } |
| } |
| impl_display_for_op!(OpI2F); |
| |
| /// Not used on SM70+ |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpI2I { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub src: Src, |
| |
| pub src_type: IntType, |
| pub dst_type: IntType, |
| |
| pub saturate: bool, |
| pub abs: bool, |
| pub neg: bool, |
| } |
| |
| impl DisplayOp for OpI2I { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "i2i")?; |
| if self.saturate { |
| write!(f, ".sat ")?; |
| } |
| write!(f, "{}{} {}", self.dst_type, self.src_type, self.src,)?; |
| if self.abs { |
| write!(f, ".abs")?; |
| } |
| if self.neg { |
| write!(f, ".neg")?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpI2I); |
| |
| #[repr(C)] |
| #[derive(DstsAsSlice)] |
| pub struct OpFRnd { |
| #[dst_type(F32)] |
| pub dst: Dst, |
| |
| pub src: Src, |
| |
| pub dst_type: FloatType, |
| pub src_type: FloatType, |
| pub rnd_mode: FRndMode, |
| pub ftz: bool, |
| } |
| |
| impl AsSlice<Src> for OpFRnd { |
| type Attr = SrcType; |
| |
| fn as_slice(&self) -> &[Src] { |
| std::slice::from_ref(&self.src) |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Src] { |
| std::slice::from_mut(&mut self.src) |
| } |
| |
| fn attrs(&self) -> SrcTypeList { |
| let src_type = match self.src_type { |
| FloatType::F16 => SrcType::F16, |
| FloatType::F32 => SrcType::F32, |
| FloatType::F64 => SrcType::F64, |
| }; |
| SrcTypeList::Uniform(src_type) |
| } |
| } |
| |
| impl DisplayOp for OpFRnd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let ftz = if self.ftz { ".ftz" } else { "" }; |
| write!( |
| f, |
| "frnd{}{}{}{ftz} {}", |
| self.dst_type, self.src_type, self.rnd_mode, self.src, |
| ) |
| } |
| } |
| impl_display_for_op!(OpFRnd); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpMov { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub src: Src, |
| |
| pub quad_lanes: u8, |
| } |
| |
| impl DisplayOp for OpMov { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| if self.quad_lanes == 0xf { |
| write!(f, "mov {}", self.src) |
| } else { |
| write!(f, "mov[{:#x}] {}", self.quad_lanes, self.src) |
| } |
| } |
| } |
| impl_display_for_op!(OpMov); |
| |
| #[derive(Copy, Clone)] |
| pub struct PrmtSelByte(u8); |
| |
| impl PrmtSelByte { |
| pub const INVALID: PrmtSelByte = PrmtSelByte(u8::MAX); |
| |
| pub fn new(src_idx: usize, byte_idx: usize, msb: bool) -> PrmtSelByte { |
| assert!(src_idx < 2); |
| assert!(byte_idx < 4); |
| |
| let mut nib = 0; |
| nib |= (src_idx as u8) << 2; |
| nib |= byte_idx as u8; |
| if msb { |
| nib |= 0x8; |
| } |
| PrmtSelByte(nib) |
| } |
| |
| pub fn src(&self) -> usize { |
| ((self.0 >> 2) & 0x1).into() |
| } |
| |
| pub fn byte(&self) -> usize { |
| (self.0 & 0x3).into() |
| } |
| |
| pub fn msb(&self) -> bool { |
| (self.0 & 0x8) != 0 |
| } |
| |
| pub fn fold_u32(&self, u: u32) -> u8 { |
| let mut sb = (u >> (self.byte() * 8)) as u8; |
| if self.msb() { |
| sb = ((sb as i8) >> 7) as u8; |
| } |
| sb |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub struct PrmtSel(pub u16); |
| |
| impl PrmtSel { |
| pub fn new(bytes: [PrmtSelByte; 4]) -> PrmtSel { |
| let mut sel = 0; |
| for i in 0..4 { |
| assert!(bytes[i].0 <= 0xf); |
| sel |= u16::from(bytes[i].0) << (i * 4); |
| } |
| PrmtSel(sel) |
| } |
| |
| pub fn get(&self, byte_idx: usize) -> PrmtSelByte { |
| assert!(byte_idx < 4); |
| PrmtSelByte(((self.0 >> (byte_idx * 4)) & 0xf) as u8) |
| } |
| } |
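| |
| // A minimal sanity check of the PRMT selector encoding above (an |
| // illustrative test, not derived from hardware docs): each nibble packs |
| // a source index, a byte index, and the sign-replication bit. |
| #[cfg(test)] |
| mod prmt_sel_tests { |
|     use super::*; |
| |
|     #[test] |
|     fn sel_byte_round_trips_and_folds() { |
|         let b = PrmtSelByte::new(1, 2, false); |
|         assert_eq!(b.src(), 1); |
|         assert_eq!(b.byte(), 2); |
|         assert!(!b.msb()); |
|         // Byte 2 of 0x1256_cdef is 0x56. |
|         assert_eq!(b.fold_u32(0x1256_cdef), 0x56); |
| |
|         // With msb set, the sign bit of the selected byte is replicated. |
|         let s = PrmtSelByte::new(0, 3, true); |
|         assert_eq!(s.fold_u32(0x8000_0000), 0xff); |
| |
|         // PrmtSel packs four nibbles, LSB first. |
|         let sel = PrmtSel::new([b, b, s, PrmtSelByte::new(0, 0, false)]); |
|         assert_eq!(sel.get(2).0, s.0); |
|     } |
| } |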
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum PrmtMode { |
| Index, |
| Forward4Extract, |
| Backward4Extract, |
| Replicate8, |
| EdgeClampLeft, |
| EdgeClampRight, |
| Replicate16, |
| } |
| |
| impl fmt::Display for PrmtMode { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| PrmtMode::Index => Ok(()), |
| PrmtMode::Forward4Extract => write!(f, ".f4e"), |
| PrmtMode::Backward4Extract => write!(f, ".b4e"), |
| PrmtMode::Replicate8 => write!(f, ".rc8"), |
| PrmtMode::EdgeClampLeft => write!(f, ".ecl"), |
|             PrmtMode::EdgeClampRight => write!(f, ".ecr"), |
| PrmtMode::Replicate16 => write!(f, ".rc16"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| /// Permutes `srcs` into `dst` using `sel`. |
| pub struct OpPrmt { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 2], |
| |
| #[src_type(ALU)] |
| pub sel: Src, |
| |
| pub mode: PrmtMode, |
| } |
| |
| impl OpPrmt { |
| pub fn get_sel(&self) -> Option<PrmtSel> { |
| // TODO: We could construct a PrmtSel for the other modes but we don't |
| // use them right now because they're kinda pointless. |
| if self.mode != PrmtMode::Index { |
| return None; |
| } |
| |
| self.sel.as_u32(SrcType::ALU).map(|sel| { |
| // The top 16 bits are ignored |
| PrmtSel(sel as u16) |
| }) |
| } |
| |
| /// Reduces the sel immediate, if any. |
| pub fn reduce_sel_imm(&mut self) { |
| assert!(self.sel.src_mod.is_none()); |
| if let SrcRef::Imm32(sel) = &mut self.sel.src_ref { |
| // Only the bottom 16 bits matter anyway |
| *sel &= 0xffff; |
| } |
| } |
| |
| pub fn as_u32(&self) -> Option<u32> { |
| let sel = self.get_sel()?; |
| |
| let mut imm = 0_u32; |
| for b in 0..4 { |
| let sel_byte = sel.get(b); |
| let src_u32 = self.srcs[sel_byte.src()].as_u32(SrcType::ALU)?; |
| |
| let sb = sel_byte.fold_u32(src_u32); |
| imm |= u32::from(sb) << (b * 8); |
| } |
| Some(imm) |
| } |
| } |
| |
| impl Foldable for OpPrmt { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let srcs = [ |
| f.get_u32_src(self, &self.srcs[0]), |
| f.get_u32_src(self, &self.srcs[1]), |
| ]; |
| let sel = f.get_u32_src(self, &self.sel); |
| |
| assert!(self.mode == PrmtMode::Index); |
| let sel = PrmtSel(sel as u16); |
| |
| let mut dst = 0_u32; |
| for b in 0..4 { |
| let sel_byte = sel.get(b); |
| let src = srcs[sel_byte.src()]; |
| let sb = sel_byte.fold_u32(src); |
| dst |= u32::from(sb) << (b * 8); |
| } |
| |
| f.set_u32_dst(self, &self.dst, dst); |
| } |
| } |
| |
| impl DisplayOp for OpPrmt { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "prmt{} {} [{}] {}", |
| self.mode, self.srcs[0], self.sel, self.srcs[1], |
| ) |
| } |
| } |
| impl_display_for_op!(OpPrmt); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSel { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(Pred)] |
| pub cond: Src, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 2], |
| } |
| |
| impl DisplayOp for OpSel { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "sel {} {} {}", self.cond, self.srcs[0], self.srcs[1],) |
| } |
| } |
| impl_display_for_op!(OpSel); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpShfl { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[dst_type(Pred)] |
| pub in_bounds: Dst, |
| |
| #[src_type(SSA)] |
| pub src: Src, |
| |
| #[src_type(ALU)] |
| pub lane: Src, |
| |
| #[src_type(ALU)] |
| pub c: Src, |
| |
| pub op: ShflOp, |
| } |
| |
| impl OpShfl { |
| /// Reduces the lane and c immediates, if any. The hardware only uses |
| /// some of the bits of `lane` and `c` and ignores the rest. This method |
| /// masks off the unused bits and ensures that any immediate values fit |
| /// in the limited encoding space in the instruction. |
| pub fn reduce_lane_c_imm(&mut self) { |
| debug_assert!(self.lane.src_mod.is_none()); |
| if let SrcRef::Imm32(lane) = &mut self.lane.src_ref { |
| *lane &= 0x1f; |
| } |
| |
| debug_assert!(self.c.src_mod.is_none()); |
| if let SrcRef::Imm32(c) = &mut self.c.src_ref { |
| *c &= 0x1f1f; |
| } |
| } |
| } |
| |
| impl DisplayOp for OpShfl { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "shfl.{} {} {} {}", self.op, self.src, self.lane, self.c) |
| } |
| } |
| impl_display_for_op!(OpShfl); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpPLop3 { |
| #[dst_type(Pred)] |
| pub dsts: [Dst; 2], |
| |
| #[src_type(Pred)] |
| pub srcs: [Src; 3], |
| |
| pub ops: [LogicOp3; 2], |
| } |
| |
| impl DisplayOp for OpPLop3 { |
| fn fmt_dsts(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "{} {}", self.dsts[0], self.dsts[1]) |
| } |
| |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "plop3 {} {} {} {} {}", |
| self.srcs[0], self.srcs[1], self.srcs[2], self.ops[0], self.ops[1], |
| ) |
| } |
| } |
| impl_display_for_op!(OpPLop3); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpPSetP { |
| #[dst_type(Pred)] |
| pub dsts: [Dst; 2], |
| |
| pub ops: [PredSetOp; 2], |
| |
| #[src_type(Pred)] |
| pub srcs: [Src; 3], |
| } |
| |
| impl Foldable for OpPSetP { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let srcs = [ |
| f.get_pred_src(self, &self.srcs[0]), |
| f.get_pred_src(self, &self.srcs[1]), |
| f.get_pred_src(self, &self.srcs[2]), |
| ]; |
| |
| let tmp = self.ops[0].eval(srcs[0], srcs[1]); |
| let dst0 = self.ops[1].eval(srcs[2], tmp); |
| |
| let tmp = self.ops[0].eval(!srcs[0], srcs[1]); |
| let dst1 = self.ops[1].eval(srcs[2], tmp); |
| |
| f.set_pred_dst(self, &self.dsts[0], dst0); |
| f.set_pred_dst(self, &self.dsts[1], dst1); |
| } |
| } |
| |
| impl DisplayOp for OpPSetP { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "psetp{}{} {} {} {}", |
| self.ops[0], self.ops[1], self.srcs[0], self.srcs[1], self.srcs[2], |
| ) |
| } |
| } |
| impl_display_for_op!(OpPSetP); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpPopC { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(B32)] |
| pub src: Src, |
| } |
| |
| impl Foldable for OpPopC { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let src = f.get_u32_bnot_src(self, &self.src); |
| let dst = src.count_ones(); |
| f.set_u32_dst(self, &self.dst, dst); |
| } |
| } |
| |
| impl DisplayOp for OpPopC { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "popc {}", self.src,) |
| } |
| } |
| impl_display_for_op!(OpPopC); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpR2UR { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub src: Src, |
| } |
| |
| impl DisplayOp for OpR2UR { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "r2ur {}", self.src) |
| } |
| } |
| impl_display_for_op!(OpR2UR); |
| |
| #[derive(Copy, Clone, PartialEq, Eq)] |
| pub enum ReduxOp { |
| And, |
| Or, |
| Xor, |
| Sum, |
| Min(IntCmpType), |
| Max(IntCmpType), |
| } |
| |
| impl fmt::Display for ReduxOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| ReduxOp::And => write!(f, ".and"), |
| ReduxOp::Or => write!(f, ".or"), |
| ReduxOp::Xor => write!(f, ".xor"), |
| ReduxOp::Sum => write!(f, ".sum"), |
| ReduxOp::Min(cmp) => write!(f, ".min{cmp}"), |
| ReduxOp::Max(cmp) => write!(f, ".max{cmp}"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpRedux { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub src: Src, |
| |
| pub op: ReduxOp, |
| } |
| |
| impl DisplayOp for OpRedux { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "redux{} {}", self.op, self.src) |
| } |
| } |
| impl_display_for_op!(OpRedux); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpTex { |
| pub dsts: [Dst; 2], |
| pub fault: Dst, |
| |
| pub tex: TexRef, |
| |
| #[src_type(SSA)] |
| pub srcs: [Src; 2], |
| |
| pub dim: TexDim, |
| pub lod_mode: TexLodMode, |
| pub deriv_mode: TexDerivMode, |
| pub z_cmpr: bool, |
| pub offset_mode: TexOffsetMode, |
| pub mem_eviction_priority: MemEvictionPriority, |
| pub nodep: bool, |
| pub channel_mask: ChannelMask, |
| } |
| |
| impl DisplayOp for OpTex { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "tex{}{}{}{}", |
| self.dim, self.lod_mode, self.offset_mode, self.deriv_mode |
| )?; |
| if self.z_cmpr { |
| write!(f, ".dc")?; |
| } |
| write!(f, "{}", self.mem_eviction_priority)?; |
| if self.nodep { |
| write!(f, ".nodep")?; |
| } |
| write!(f, "{}", self.channel_mask)?; |
| write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1]) |
| } |
| } |
| impl_display_for_op!(OpTex); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpTld { |
| pub dsts: [Dst; 2], |
| pub fault: Dst, |
| |
| pub tex: TexRef, |
| |
| #[src_type(SSA)] |
| pub srcs: [Src; 2], |
| |
| pub dim: TexDim, |
| pub is_ms: bool, |
| pub lod_mode: TexLodMode, |
| pub offset_mode: TexOffsetMode, |
| pub mem_eviction_priority: MemEvictionPriority, |
| pub nodep: bool, |
| pub channel_mask: ChannelMask, |
| } |
| |
| impl DisplayOp for OpTld { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "tld{}{}{}", self.dim, self.lod_mode, self.offset_mode)?; |
| if self.is_ms { |
| write!(f, ".ms")?; |
| } |
| write!(f, "{}", self.mem_eviction_priority)?; |
| if self.nodep { |
| write!(f, ".nodep")?; |
| } |
| write!(f, "{}", self.channel_mask)?; |
| write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1]) |
| } |
| } |
| impl_display_for_op!(OpTld); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpTld4 { |
| pub dsts: [Dst; 2], |
| pub fault: Dst, |
| |
| pub tex: TexRef, |
| |
| #[src_type(SSA)] |
| pub srcs: [Src; 2], |
| |
| pub dim: TexDim, |
| pub comp: u8, |
| pub offset_mode: TexOffsetMode, |
| pub z_cmpr: bool, |
| pub mem_eviction_priority: MemEvictionPriority, |
| pub nodep: bool, |
| pub channel_mask: ChannelMask, |
| } |
| |
| impl DisplayOp for OpTld4 { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "tld4.g{}{}", self.dim, self.offset_mode)?; |
| if self.z_cmpr { |
| write!(f, ".dc")?; |
| } |
| write!(f, "{}", self.mem_eviction_priority)?; |
| if self.nodep { |
| write!(f, ".nodep")?; |
| } |
| write!(f, "{}", self.channel_mask)?; |
| write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1]) |
| } |
| } |
| impl_display_for_op!(OpTld4); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpTmml { |
| pub dsts: [Dst; 2], |
| |
| pub tex: TexRef, |
| |
| #[src_type(SSA)] |
| pub srcs: [Src; 2], |
| |
| pub dim: TexDim, |
| pub deriv_mode: TexDerivMode, |
| pub nodep: bool, |
| pub channel_mask: ChannelMask, |
| } |
| |
| impl DisplayOp for OpTmml { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "tmml.lod{}{}", self.dim, self.deriv_mode)?; |
| if self.nodep { |
| write!(f, ".nodep")?; |
| } |
| write!(f, "{}", self.channel_mask)?; |
| write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1]) |
| } |
| } |
| impl_display_for_op!(OpTmml); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpTxd { |
| pub dsts: [Dst; 2], |
| pub fault: Dst, |
| |
| pub tex: TexRef, |
| |
| #[src_type(SSA)] |
| pub srcs: [Src; 2], |
| |
| pub dim: TexDim, |
| pub offset_mode: TexOffsetMode, |
| pub mem_eviction_priority: MemEvictionPriority, |
| pub nodep: bool, |
| pub channel_mask: ChannelMask, |
| } |
| |
| impl DisplayOp for OpTxd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "txd{}{}{}", |
| self.dim, self.offset_mode, self.mem_eviction_priority |
| )?; |
| if self.nodep { |
| write!(f, ".nodep")?; |
| } |
| write!(f, "{}", self.channel_mask)?; |
| write!(f, " {} {} {}", self.tex, self.srcs[0], self.srcs[1]) |
| } |
| } |
| impl_display_for_op!(OpTxd); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpTxq { |
| pub dsts: [Dst; 2], |
| |
| pub tex: TexRef, |
| |
| #[src_type(SSA)] |
| pub src: Src, |
| |
| pub query: TexQuery, |
| pub nodep: bool, |
| pub channel_mask: ChannelMask, |
| } |
| |
| impl DisplayOp for OpTxq { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "txq")?; |
| if self.nodep { |
| write!(f, ".nodep")?; |
| } |
| write!(f, "{}", self.channel_mask)?; |
| write!(f, " {} {} {}", self.tex, self.src, self.query) |
| } |
| } |
| impl_display_for_op!(OpTxq); |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| pub enum ImageAccess { |
| Binary(MemType), |
| Formatted(ChannelMask), |
| } |
| |
| impl fmt::Display for ImageAccess { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| ImageAccess::Binary(mem_type) => write!(f, ".b{mem_type}"), |
| ImageAccess::Formatted(mask) => write!(f, ".p{mask}"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSuLd { |
| pub dst: Dst, |
| pub fault: Dst, |
| |
| pub image_access: ImageAccess, |
| pub image_dim: ImageDim, |
| pub mem_order: MemOrder, |
| pub mem_eviction_priority: MemEvictionPriority, |
| |
| #[src_type(SSA)] |
| pub handle: Src, |
| |
| #[src_type(SSA)] |
| pub coord: Src, |
| } |
| |
| impl DisplayOp for OpSuLd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "suld{}{}{}{} [{}] {}", |
| self.image_access, |
| self.image_dim, |
| self.mem_order, |
| self.mem_eviction_priority, |
| self.coord, |
| self.handle, |
| ) |
| } |
| } |
| impl_display_for_op!(OpSuLd); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSuSt { |
| pub image_access: ImageAccess, |
| pub image_dim: ImageDim, |
| pub mem_order: MemOrder, |
| pub mem_eviction_priority: MemEvictionPriority, |
| |
| #[src_type(SSA)] |
| pub handle: Src, |
| |
| #[src_type(SSA)] |
| pub coord: Src, |
| |
| #[src_type(SSA)] |
| pub data: Src, |
| } |
| |
| impl DisplayOp for OpSuSt { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "sust{}{}{}{} [{}] {} {}", |
| self.image_access, |
| self.image_dim, |
| self.mem_order, |
| self.mem_eviction_priority, |
| self.coord, |
| self.data, |
| self.handle, |
| ) |
| } |
| } |
| impl_display_for_op!(OpSuSt); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSuAtom { |
| pub dst: Dst, |
| pub fault: Dst, |
| |
| pub image_dim: ImageDim, |
| |
| pub atom_op: AtomOp, |
| pub atom_type: AtomType, |
| |
| pub mem_order: MemOrder, |
| pub mem_eviction_priority: MemEvictionPriority, |
| |
| #[src_type(SSA)] |
| pub handle: Src, |
| |
| #[src_type(SSA)] |
| pub coord: Src, |
| |
| #[src_type(SSA)] |
| pub data: Src, |
| } |
| |
| impl DisplayOp for OpSuAtom { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "suatom.p{}{}{}{}{} [{}] {} {}", |
| self.image_dim, |
| self.atom_op, |
| self.atom_type, |
| self.mem_order, |
| self.mem_eviction_priority, |
| self.coord, |
| self.data, |
| self.handle, |
| ) |
| } |
| } |
| impl_display_for_op!(OpSuAtom); |
| |
| #[derive(Clone, Copy)] |
| pub enum SuClampMode { |
| StoredInDescriptor, |
| PitchLinear, |
| BlockLinear, |
| } |
| |
| impl fmt::Display for SuClampMode { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let s = match self { |
| SuClampMode::StoredInDescriptor => ".sd", |
| SuClampMode::PitchLinear => ".pl", |
| SuClampMode::BlockLinear => ".bl", |
| }; |
| write!(f, "{}", s) |
| } |
| } |
| |
| #[derive(Clone, Copy)] |
| pub enum SuClampRound { |
| R1, |
| R2, |
| R4, |
| R8, |
| R16, |
| } |
| |
| impl SuClampRound { |
| pub fn to_int(&self) -> u8 { |
| match self { |
| SuClampRound::R1 => 1, |
| SuClampRound::R2 => 2, |
| SuClampRound::R4 => 4, |
| SuClampRound::R8 => 8, |
| SuClampRound::R16 => 16, |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn to_mask(&self) -> u32 { |
| !(self.to_int() as u32 - 1) |
| } |
| } |
| |
| impl fmt::Display for SuClampRound { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, ".r{}", self.to_int()) |
| } |
| } |
| |
| /// Kepler only |
| /// Surface Clamp |
| /// |
| /// Clamps coordinates of surface operations to an inclusive 0..=bound |
| /// range. It also computes other information needed to compute the real |
| /// address of an element within an image, for both block-linear and |
| /// pitch-linear layouts. We can also reduce this operation to a plain |
| /// inclusive clamp by setting Mode=PitchLinear and is_2d=false; this |
| /// skips the extra computations and is useful for clamping array |
| /// indices. |
| /// |
| /// Since the shader code does not know whether an image layout is |
| /// block-linear or pitch-linear, this opcode must be able to do both; |
| /// the operation is then selected by the "clamp" bitfield, usually read |
| /// from a descriptor. In block-linear mode, we divide the bits that |
| /// compute the higher part from those that compute the lower part. |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice, Clone)] |
| pub struct OpSuClamp { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| #[dst_type(Pred)] |
| pub out_of_bounds: Dst, |
| |
|     /// This modifier specifies whether we use pitch-linear or |
|     /// block-linear calculations; a third option is to support both and |
|     /// read the actual layout from the clamp bitfield (shader code |
|     /// doesn't always know an image's layout). |
|     /// When mode=pitch_linear and is_2d=false, the suclamp op enters a |
|     /// simpler "plain" mode where it only performs clamping and the |
|     /// output register doesn't contain any information bits about |
|     /// pitch-linear or block-linear calculations. |
|     pub mode: SuClampMode, |
|     /// Strangely enough, "round" just rounds the clamp, not the source, |
|     /// so it does not help at all with clamping coordinates. It could be |
|     /// useful when clamping raw addresses of a multi-byte read. |
|     /// Ex: if we read 4 bytes at once and the buffer length is 16, the |
|     /// bound will be 15 (bounds are inclusive), but a read at address 15 |
|     /// would touch bytes 15..19, out of range. If we round the bound to |
|     /// R4, the effective bound becomes 12, so the read covers 12..16 and |
|     /// stays in bounds (see the small test after this struct). |
|     pub round: SuClampRound, |
| pub is_s32: bool, |
| pub is_2d: bool, |
| |
| #[src_type(GPR)] |
| pub coords: Src, |
| |
|     /// Packed parameter containing both the bound (inclusive) |
|     /// and other information (explained in more detail in the Foldable |
|     /// impl): |
|     /// 0..20: bound (inclusive) |
|     /// 21: pitch_linear (used if mode == StoredInDescriptor) |
|     /// 22..26: coord shl |
|     /// 26..29: coord shr |
|     /// 29..32: n. of tiles |
|     #[src_type(ALU)] |
|     pub params: Src, |
|     /// Added to the coords; only 6 bits (an i6) are encoded. |
| pub imm: i8, |
| } |
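| |
| // A small illustrative test of the rounding mask described on `round` |
| // above (just re-running the prose example with a 16-byte buffer): R4 |
| // turns an inclusive bound of 15 into 12 so that a 4-byte access at the |
| // clamped address stays in bounds. |
| #[cfg(test)] |
| mod su_clamp_round_tests { |
|     use super::*; |
| |
|     #[test] |
|     fn round_masks_bound() { |
|         assert_eq!(SuClampRound::R4.to_mask(), !3u32); |
|         assert_eq!(15 & SuClampRound::R4.to_mask(), 12); |
|     } |
| } |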
| |
| impl Foldable for OpSuClamp { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let src = f.get_u32_src(self, &self.coords); |
| let params = f.get_u32_src(self, &self.params); |
| let imm = self.imm; // i6 |
| |
| let src = if self.is_s32 { |
| (src as i32) as i64 |
| } else { |
| src as i64 |
| }; |
| let src = src + (imm as i64); |
| |
| let params_bv = BitView::new(¶ms); |
| let pitch_linear = match self.mode { |
| SuClampMode::StoredInDescriptor => params_bv.get_bit(21), |
| SuClampMode::PitchLinear => true, |
| SuClampMode::BlockLinear => false, |
| }; |
| |
| let bounds = if pitch_linear && !self.is_2d { |
| params |
| } else { |
| params_bv.get_bit_range_u64(0..20) as u32 |
| }; |
| |
| let bounds = bounds & self.round.to_mask(); |
| let (is_oob, clamped) = if src < 0 { |
| (true, 0) |
| } else if src > (bounds as i64) { |
| (true, bounds) |
| } else { |
| (false, src as u32) |
| }; |
| |
| let mut out = 0u32; |
| let mut bv = BitMutView::new(&mut out); |
| if pitch_linear { |
| if !self.is_2d { |
| // simple clamp mode, NO BITFIELD |
| bv.set_field(0..32, clamped); |
| } else { |
| // Real, pitch_linear mode |
| bv.set_field(0..20, clamped & 0xfffff); |
| |
| // Pass through el_size_log2 |
| bv.set_field(27..30, params_bv.get_bit_range_u64(26..29)); |
| bv.set_bit(30, true); // pitch_linear=true |
| bv.set_bit(31, is_oob); |
| } |
| } else { |
| // Block linear |
| |
| // Number of bits to discard for GoB coordinates |
| let shr_a = params_bv.get_bit_range_u64(22..26) as u8; |
| // Block coords |
| bv.set_field(0..16, (clamped >> shr_a) & 0xffff); |
| |
| // Shift applied to coords, always zero except for x. |
| // (for coord x=1 and format R32, we want to access byte 4) |
| // e.g. R8 -> 0, R32 -> 2, 128 -> 4 |
| let el_size_log2 = params_bv.get_bit_range_u64(26..29) as u8; |
| // Coord inside GoB (element space) |
| bv.set_field(16..24, (clamped << el_size_log2) & 0xff); |
| |
| // Useful later to compute gob-space coords. |
| let n_tiles = params_bv.get_bit_range_u64(29..32) as u8; |
| bv.set_field(27..30, n_tiles); |
| bv.set_bit(30, false); // pitch_linear=false |
| bv.set_bit(31, is_oob); |
| } |
| f.set_u32_dst(self, &self.dst, out); |
| f.set_pred_dst(self, &self.out_of_bounds, is_oob); |
| } |
| } |
| |
| impl DisplayOp for OpSuClamp { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "suclamp{}", self.mode)?; |
| if !matches!(self.round, SuClampRound::R1) { |
| write!(f, "{}", self.round)?; |
| } |
| if !self.is_s32 { |
| write!(f, ".u32")?; |
| } |
| if !self.is_2d { |
| write!(f, ".1d")?; |
| } |
| |
|         write!(f, " {} {} {:x}", self.coords, self.params, self.imm) |
| } |
| } |
| impl_display_for_op!(OpSuClamp); |
| |
| /// Kepler only |
| /// BitField Merge |
| /// |
| /// The resulting bit-field is composed of a high part (bits 8..32) that |
| /// is merged with the address by sueau, and a low part (bits 0..8) that |
| /// is provided directly to suldga/sustga and defines the lower offset |
| /// into the global array. |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice, Clone)] |
| pub struct OpSuBfm { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| #[dst_type(Pred)] |
| pub pdst: Dst, |
| |
| /// x, y, z |
| #[src_type(ALU)] |
| pub srcs: [Src; 3], |
|     /// When is_3d=false, the third source is ignored for the tile |
|     /// computation but is still used in the pitch-linear path. |
|     pub is_3d: bool, |
| } |
| |
| impl Foldable for OpSuBfm { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let x_raw = f.get_u32_src(self, &self.srcs[0]); |
| let y_raw = f.get_u32_src(self, &self.srcs[1]); |
| let z_raw = f.get_u32_src(self, &self.srcs[2]); |
| |
| let x = BitView::new(&x_raw); |
| let y = BitView::new(&y_raw); |
| let z = BitView::new(&z_raw); |
| |
| let mut o_raw = 0u32; |
| let mut o = BitMutView::new(&mut o_raw); |
| |
| let is_pitch_linear_2d = x.get_bit(30) || y.get_bit(30); |
| |
| if !is_pitch_linear_2d { |
| // Copy coordinates inside of GoB space. |
| // They are 6 bits from x and 3 from y (GoB is 64x8 bytes). |
| // Bits from 0..8 are ignored by sueau and are used directly |
| // by suldga/sustga. |
| // Bit 9 will become the first bit of the higher part in |
| // sueau. |
| o.set_bit_range_u64(0..4, x.get_bit_range_u64(16..20)); |
| |
| // Address calculation inside of GoB should virtually be |
| // y * 64 + x * element_size (each row is linear). |
| // So why are those bits swizzled like so? |
| // I have no idea, but these are correct even for atomics |
| // that accept real addresses. |
| o.set_bit(4, y.get_bit(16)); |
| o.set_bit(5, y.get_bit(17)); |
| o.set_bit(6, x.get_bit(20)); |
| o.set_bit(7, y.get_bit(18)); |
| |
| o.set_bit(8, x.get_bit(21)); |
| // 9..11: 0 |
| |
| // -------------- Tiles -------------- |
| // Number of tiles log2 |
| let ntx = x.get_bit_range_u64(27..30) & 0x1; |
| let nty = y.get_bit_range_u64(27..30); |
| let ntz = z.get_bit_range_u64(27..30); |
| let ntz = ntz * (self.is_3d as u64); // z is ignored if is_3d=false |
| |
| // Computes how many bits to dedicate to GoB coords inside |
| // a block |
| o.set_field(12..16, ntx + nty + ntz); |
| |
| // Coords in gob_space. |
| // Remove 6 bits from x and 3 bits from y, those are used |
| // as element coords in GoB space. |
| let a = x.get_bit_range_u64(22..24); // 1100_0000 |
| let b = y.get_bit_range_u64(19..24); // 1111_1000 |
| let c = z.get_bit_range_u64(16..24); // 1111_1111 |
| |
| // nt* indicates how many bits to consider (max 5) |
| let a = a & ((1 << ntx) - 1); |
| let b = b & ((1 << nty.min(5)) - 1); |
| let c = c & ((1 << ntz.min(5)) - 1); |
| |
| // Compute gob offset |
| // We can just or together at certain offsets because |
| // Tiles are always powers of two in each direction. |
| // z || y || x (LSB) |
| let res = c; |
| let res = (res << nty) | b; |
| let res = (res << ntx) | a; |
| let mask = match ntx { |
| 0 => 0x3ff, |
| _ => 0x7ff, |
| }; |
| |
| // gob coords will be put before the block coords in |
| // sueau. |
| o.set_field(16..27, res & mask); |
| } else { |
| let d = z.get_bit_range_u64(0..8); |
| let el_size_log2 = x.get_bit_range_u64(27..30); |
| o.set_field(0..8, (d << el_size_log2) & 0xff); |
| // 9..11: 0 |
| o.set_field(12..15, el_size_log2); |
| } |
| |
| o.set_bit(11, is_pitch_linear_2d); |
| |
| let is_oob = |
| x.get_bit(31) || y.get_bit(31) || (z.get_bit(31) && self.is_3d); |
| f.set_u32_dst(self, &self.dst, o_raw); |
| f.set_pred_dst(self, &self.pdst, is_oob); |
| } |
| } |
| |
| impl DisplayOp for OpSuBfm { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "subfm")?; |
| |
| if self.is_3d { |
| write!(f, ".3d")?; |
| } |
| |
| write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2]) |
| } |
| } |
| impl_display_for_op!(OpSuBfm); |
| |
| /// Kepler only |
| /// Used to compute the higher 32 bits of image address using |
| /// the merged bitfield and the block coordinates (offset). |
| /// It can switch to a pitch_linear mode (bit 11 of bit-field). |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice, Clone)] |
| pub struct OpSuEau { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
|     /// Offset computed from the block coordinates. |
|     /// It's OK to add it directly to the address since they are both |
|     /// "aligned" to 64 (the first 8 bits are removed from both). |
| #[src_type(GPR)] |
| pub off: Src, |
| |
|     /// 8..9:   offset, last bit |
|     /// 11..12: pitch_linear; when enabled the bf-offset is ignored and |
|     ///         8 is subtracted from off_shl |
|     /// 12..16: off_shl, shifts the offset left by off_shl + 1 |
|     /// 16..27: 11-bit offset; joined with the 1-bit offset it completes |
|     ///         the 12-bit offset ORed into the src offset after shifting |
|     ///         (unless pitch_linear); see the small test below |
| #[src_type(ALU)] |
| pub bit_field: Src, |
| |
| #[src_type(GPR)] |
| pub addr: Src, |
| } |
| |
| impl Foldable for OpSuEau { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let off_raw = f.get_u32_src(self, &self.off); |
| let bf_raw = f.get_u32_src(self, &self.bit_field); |
| let addr = f.get_u32_src(self, &self.addr); |
| |
| let bf = BitView::new(&bf_raw); |
| |
| let off1 = bf.get_bit_range_u64(8..9) as u32; |
| let is_pitch_linear = bf.get_bit(11); |
| let off_shift = bf.get_bit_range_u64(12..16) as u32; |
| let offs = bf.get_bit_range_u64(16..27) as u32; |
| |
| let res = if !is_pitch_linear { |
| // Block linear |
| // off_raw are the block coordinates |
| // to those we add gob coordinates from the merged bitfield |
| // and the MSB of in-gob coordinates. |
| let omul = off_shift + 1; |
| let real_off = (off_raw << omul) | (offs << 1) | off1; |
| addr.wrapping_add(real_off & 0x7ff_ffff) |
| } else { |
| // Add the high part of the coordinates to addr |
| // off << (omul - 8) |
| // but for negative values do a shr instead. |
| // In fact, off_shift will always be < 8 because pitch_linear |
| // subfm only assigns bits 12..15, so this is always a shr |
| let shl_amount = off_shift as i32 - 8; |
| let off = if shl_amount < 0 { |
| off_raw >> (-shl_amount as u32) |
| } else { |
| off_raw << (shl_amount as u32) |
| }; |
| addr.wrapping_add(off & 0xff_ffff) |
| }; |
| f.set_u32_dst(self, &self.dst, res); |
| } |
| } |
| |
| impl DisplayOp for OpSuEau { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write! {f, "sueau {} {} {}", self.off, self.bit_field, self.addr} |
| } |
| } |
| impl_display_for_op!(OpSuEau); |
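| |
| // An illustrative recomputation of the block-linear offset composition |
| // described on OpSuEau::bit_field (hypothetical field values, mirroring |
| // the Foldable impl above): block coords are shifted by off_shl + 1, |
| // then ORed with the 11-bit offset and the final offset bit. |
| #[cfg(test)] |
| mod su_eau_tests { |
|     #[test] |
|     fn block_linear_offset_composition() { |
|         let (off, off_shl, offs, off1) = (0x3u32, 2u32, 0x5u32, 1u32); |
|         let real_off = (off << (off_shl + 1)) | (offs << 1) | off1; |
|         assert_eq!(real_off, 0x1b); |
|     } |
| } |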
| |
| #[derive(Copy, Clone, Debug)] |
| pub enum IMadSpSrcType { |
| U32, |
| U24, |
| U16Hi, |
| U16Lo, |
| S32, |
| S24, |
| S16Hi, |
| S16Lo, |
| } |
| |
| impl IMadSpSrcType { |
| pub fn unsigned(self) -> IMadSpSrcType { |
| use IMadSpSrcType::*; |
| match self { |
| S32 => U32, |
| S24 => U24, |
| S16Hi => U16Hi, |
| S16Lo => U16Lo, |
| x => x, |
| } |
| } |
| |
| #[allow(dead_code)] // Used in hw_tests |
| pub fn with_sign(self, sign: bool) -> Self { |
| use IMadSpSrcType::*; |
| if !sign { |
| return self.unsigned(); |
| } |
| match self { |
| U32 => S32, |
| U24 => S24, |
| U16Hi => S16Hi, |
| U16Lo => S16Lo, |
| x => x, |
| } |
| } |
| |
| pub fn sign(self) -> bool { |
| use IMadSpSrcType::*; |
| match self { |
| U32 | U24 | U16Hi | U16Lo => false, |
| S32 | S24 | S16Hi | S16Lo => true, |
| } |
| } |
| |
| #[allow(dead_code)] |
| fn cast(&self, v: u32) -> i64 { |
| use IMadSpSrcType::*; |
| match self { |
| U32 => v as i64, |
| U24 => (v & 0x00ff_ffff) as i64, |
| U16Lo => (v as u16) as i64, |
| U16Hi => (v >> 16) as i64, |
| S32 => (v as i32) as i64, |
| S24 => (((v as i32) << 8) >> 8) as i64, // Sign extend |
| S16Lo => (v as i16) as i64, |
| S16Hi => ((v >> 16) as i16) as i64, |
| } |
| } |
| } |
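| |
| // Illustrative checks of the extraction semantics implemented by `cast` |
| // above (the values are chosen here, not taken from hardware docs). |
| #[cfg(test)] |
| mod imad_sp_src_type_tests { |
|     use super::*; |
| |
|     #[test] |
|     fn casts_extract_and_sign_extend() { |
|         use IMadSpSrcType::*; |
|         assert_eq!(U16Hi.cast(0xabcd_1234), 0xabcd); |
|         assert_eq!(U24.cast(0xff12_3456), 0x12_3456); |
|         assert_eq!(S16Lo.cast(0x0000_8000), -32768); |
|         assert_eq!(S24.cast(0x00ff_ffff), -1); // bit 23 set => negative |
|     } |
| } |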
| |
| impl fmt::Display for IMadSpSrcType { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let sign = if self.sign() { ".s" } else { ".u" }; |
| let width = match self.unsigned() { |
| IMadSpSrcType::U32 => "32", |
| IMadSpSrcType::U24 => "24", |
| IMadSpSrcType::U16Lo => "16h0", |
| IMadSpSrcType::U16Hi => "16h1", |
| _ => unreachable!(), |
| }; |
| write!(f, "{}{}", sign, width) |
| } |
| } |
| |
| #[derive(Clone, Copy, Debug)] |
| pub enum IMadSpMode { |
| Explicit([IMadSpSrcType; 3]), |
| // Parameters are loaded from src1 bits 26..32 |
| FromSrc1, |
| } |
| |
| impl fmt::Display for IMadSpMode { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| IMadSpMode::Explicit([a, b, c]) => write!(f, "{a}{b}{c}"), |
| IMadSpMode::FromSrc1 => write!(f, ".sd"), |
| } |
| } |
| } |
| |
| /// Kepler only |
| /// Extracted Integer Multiply and Add. |
| /// It does the same operation as an imad op, but it can extract the |
| /// sources from a subset of the register (only 32, 24 or 16 bits). |
| /// It can also do a "load parameters" mode where the modifiers are |
| /// loaded from the higher bits of src1 (check the Foldable impl for |
| /// details). |
| /// Limits: src1 can never be U32 or U16Hi, |
| ///         src2 can never be U16Hi, |
| ///         src2 signedness is tied to src0 and src1 signedness: |
| ///         if either is signed, src2 must be signed too. |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice, Clone)] |
| pub struct OpIMadSp { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub srcs: [Src; 3], |
| |
| pub mode: IMadSpMode, |
| } |
| |
| impl Foldable for OpIMadSp { |
| fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { |
| let src0 = f.get_u32_src(self, &self.srcs[0]); |
| let src1 = f.get_u32_src(self, &self.srcs[1]); |
| let src2 = f.get_u32_src(self, &self.srcs[2]); |
| |
| let (src_type0, src_type1, src_type2) = match self.mode { |
| IMadSpMode::Explicit([t0, t1, t2]) => (t0, t1, t2), |
| IMadSpMode::FromSrc1 => { |
| let params = BitView::new(&src1); |
| |
| let st2 = params.get_bit_range_u64(26..28) as usize; |
| let st1 = params.get_bit_range_u64(28..30) as usize; |
| let st0 = params.get_bit_range_u64(30..32) as usize; |
| |
| use IMadSpSrcType::*; |
| let types0 = [U32, U24, U16Lo, U16Hi]; |
| let types1 = [U16Lo, U24, U16Lo, U24]; |
| let types2 = [U32, U24, U16Lo, U32]; |
| |
| ( |
| types0[st0].unsigned(), |
| types1[st1].unsigned(), |
| types2[st2].unsigned(), |
| ) |
| } |
| }; |
| |
| let src0 = src_type0.cast(src0); |
| let src1 = src_type1.cast(src1); |
| let src2 = src_type2.cast(src2); |
| |
| f.set_u32_dst(self, &self.dst, (src0 * src1 + src2) as u32); |
| } |
| } |
| |
| impl DisplayOp for OpIMadSp { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "imadsp{} {} {} {}", |
| self.mode, self.srcs[0], self.srcs[1], self.srcs[2] |
| ) |
| } |
| } |
| impl_display_for_op!(OpIMadSp); |
| |
| /// In SuGa ops, the address is always specified in two parts: the higher |
| /// part contains the base address without its lower 8 bits |
| /// (base_addr >> 8), while the lower part contains either the missing 8 |
| /// bits (U8) or a full 32-bit offset that must not be shifted (U32). |
| /// |
| /// In short (see the small test after this enum): |
| /// U8 : real_address = (addr_hi << 8) + (addr_lo & 0xFF) |
| /// U32: real_address = (addr_hi << 8) + addr_lo |
| /// The signed variants presumably do the same with sign extension. |
| #[derive(Clone, Copy)] |
| pub enum SuGaOffsetMode { |
| U32, |
| S32, |
| U8, |
| S8, |
| } |
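| |
| // A minimal sketch spelling out the two address forms from the doc |
| // comment above (hypothetical values). |
| #[cfg(test)] |
| mod su_ga_offset_tests { |
|     #[test] |
|     fn address_forms() { |
|         let (addr_hi, addr_lo) = (0x1234_u64, 0x1ff_u64); |
|         // U8: only the low 8 bits of addr_lo contribute. |
|         assert_eq!((addr_hi << 8) + (addr_lo & 0xff), 0x1234ff); |
|         // U32: addr_lo is a full, unshifted offset. |
|         assert_eq!((addr_hi << 8) + addr_lo, 0x1235ff); |
|     } |
| } |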
| |
| /// Kepler only |
| /// Load a pixel from an image, taking the pixel address and format as |
| /// arguments. Since the image coordinates are not present, the |
| /// instruction also needs an `out_of_bounds` predicate; when true, it |
| /// always loads (0, 0, 0, 1). |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSuLdGa { |
| pub dst: Dst, |
| |
| pub mem_type: MemType, |
| pub offset_mode: SuGaOffsetMode, |
| pub cache_op: LdCacheOp, |
| |
| /// Format for the loaded data, passed directly from the descriptor. |
| #[src_type(GPR)] |
| pub format: Src, |
| |
|     /// This is not an address but two registers that contain |
|     /// [addr >> 8, addr & 0xff]. |
|     /// This works because addr >> 8 fits in 32 bits (it is GOB-aligned) |
|     /// and the remaining 8 bits are extracted by the bit-field. |
|     /// It's useful since in block-linear mode the lower bits and the |
|     /// higher bits are computed in different ways. |
| #[src_type(SSA)] |
| pub addr: Src, |
| |
| #[src_type(Pred)] |
| pub out_of_bounds: Src, |
| } |
| |
| impl DisplayOp for OpSuLdGa { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "suldga{}{} [{}] {} {}", |
| self.mem_type, |
| self.cache_op, |
| self.addr, |
| self.format, |
| self.out_of_bounds |
| ) |
| } |
| } |
| impl_display_for_op!(OpSuLdGa); |
| |
| /// Kepler only |
| /// Store a pixel in an image, taking the pixel address and format as |
| /// arguments. Since the image coordinates are not present, the |
| /// instruction also needs an `out_of_bounds` predicate; when true, |
| /// stores are ignored. |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSuStGa { |
| pub image_access: ImageAccess, |
| pub offset_mode: SuGaOffsetMode, |
| pub cache_op: StCacheOp, |
| |
| #[src_type(GPR)] |
| pub format: Src, |
| |
| #[src_type(SSA)] |
| pub addr: Src, |
| |
| #[src_type(SSA)] |
| pub data: Src, |
| |
| #[src_type(Pred)] |
| pub out_of_bounds: Src, |
| } |
| |
| impl DisplayOp for OpSuStGa { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "sustga{}{} [{}] {} {} {}", |
| self.image_access, |
| self.cache_op, |
| self.addr, |
| self.format, |
| self.data, |
| self.out_of_bounds, |
| ) |
| } |
| } |
| impl_display_for_op!(OpSuStGa); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpLd { |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub addr: Src, |
| |
| pub offset: i32, |
| pub access: MemAccess, |
| } |
| |
| impl DisplayOp for OpLd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "ld{} [{}", self.access, self.addr)?; |
| if self.offset > 0 { |
| write!(f, "+{:#x}", self.offset)?; |
| } |
| write!(f, "]") |
| } |
| } |
| impl_display_for_op!(OpLd); |
| |
| #[allow(dead_code)] |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum LdcMode { |
| Indexed, |
| IndexedLinear, |
| IndexedSegmented, |
| IndexedSegmentedLinear, |
| } |
| |
| impl fmt::Display for LdcMode { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| LdcMode::Indexed => Ok(()), |
| LdcMode::IndexedLinear => write!(f, ".il"), |
| LdcMode::IndexedSegmented => write!(f, ".is"), |
| LdcMode::IndexedSegmentedLinear => write!(f, ".isl"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpLdc { |
| pub dst: Dst, |
| |
| #[src_type(ALU)] |
| pub cb: Src, |
| |
| #[src_type(GPR)] |
| pub offset: Src, |
| |
| pub mode: LdcMode, |
| pub mem_type: MemType, |
| } |
| |
| impl DisplayOp for OpLdc { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let SrcRef::CBuf(cb) = &self.cb.src_ref else { |
| panic!("Not a cbuf"); |
| }; |
| write!(f, "ldc{}{} {}[", self.mode, self.mem_type, cb.buf)?; |
| if self.offset.is_zero() { |
| write!(f, "+{:#x}", cb.offset)?; |
| } else if cb.offset == 0 { |
| write!(f, "{}", self.offset)?; |
| } else { |
| write!(f, "{}+{:#x}", self.offset, cb.offset)?; |
| } |
| write!(f, "]") |
| } |
| } |
| impl_display_for_op!(OpLdc); |
| |
| #[derive(Clone, Copy, Eq, PartialEq)] |
| #[allow(dead_code)] |
| pub enum LdsmSize { |
| M8N8, |
| MT8N8, |
| } |
| |
| impl fmt::Display for LdsmSize { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| LdsmSize::M8N8 => write!(f, "m8n8"), |
| LdsmSize::MT8N8 => write!(f, "m8n8.trans"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpLdsm { |
| #[dst_type(Vec)] |
| pub dst: Dst, |
| |
| pub mat_size: LdsmSize, |
| pub mat_count: u8, |
| |
| #[src_type(SSA)] |
| pub addr: Src, |
| |
| pub offset: i32, |
| } |
| |
| impl DisplayOp for OpLdsm { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "ldsm.16.{}.x{} [{}", |
| self.mat_size, self.mat_count, self.addr, |
| )?; |
| if self.offset > 0 { |
| write!(f, "+{:#x}", self.offset)?; |
| } |
| write!(f, "]") |
| } |
| } |
| impl_display_for_op!(OpLdsm); |
| |
| /// Used on Kepler to implement shared atomics. |
| /// In addition to the load, it tries to lock the address; Kepler |
| /// hardware has (1024?) hardware mutex locks. See the sketch after |
| /// OpStSCheckUnlock below for how the lock/store pair combines. |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpLdSharedLock { |
| pub dst: Dst, |
| #[dst_type(Pred)] |
| pub locked: Dst, |
| |
| #[src_type(GPR)] |
| pub addr: Src, |
| |
| pub offset: i32, |
| pub mem_type: MemType, |
| } |
| |
| impl DisplayOp for OpLdSharedLock { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "ldslk{} [{}", self.mem_type, self.addr)?; |
| if self.offset > 0 { |
| write!(f, "+{:#x}", self.offset)?; |
| } |
| write!(f, "]") |
| } |
| } |
| impl_display_for_op!(OpLdSharedLock); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSt { |
| #[src_type(GPR)] |
| pub addr: Src, |
| |
| #[src_type(SSA)] |
| pub data: Src, |
| |
| pub offset: i32, |
| pub access: MemAccess, |
| } |
| |
| impl DisplayOp for OpSt { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "st{} [{}", self.access, self.addr)?; |
| if self.offset > 0 { |
| write!(f, "+{:#x}", self.offset)?; |
| } |
| write!(f, "] {}", self.data) |
| } |
| } |
| impl_display_for_op!(OpSt); |
| |
| /// Used on Kepler to implement shared atomics. |
| /// It checks that the address is still properly locked, performs the |
| /// store operation, and unlocks the previously locked address. |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpStSCheckUnlock { |
| #[dst_type(Pred)] |
| pub locked: Dst, |
| |
| #[src_type(GPR)] |
| pub addr: Src, |
| #[src_type(SSA)] |
| pub data: Src, |
| |
| pub offset: i32, |
| pub mem_type: MemType, |
| } |
| |
| impl DisplayOp for OpStSCheckUnlock { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "stscul{} [{}", self.mem_type, self.addr)?; |
| if self.offset > 0 { |
| write!(f, "+{:#x}", self.offset)?; |
| } |
| write!(f, "] {}", self.data) |
| } |
| } |
| impl_display_for_op!(OpStSCheckUnlock); |
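| |
| // A hedged sketch of how OpLdSharedLock/OpStSCheckUnlock might combine |
| // to lower a shared atomic (illustrative control flow only; no pass in |
| // this file is implied and the pseudo-ops below are written informally): |
| // |
| //   loop: |
| //     (old, locked) = ldslk [addr]            // load, try to take lock |
| //     new = op(old, data)                     // the atomic operation |
| //     if (locked) locked = stscul [addr] new  // store + unlock if held |
| //     if (!locked) goto loop                  // lost the race; retry |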
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpAtom { |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub addr: Src, |
| |
| #[src_type(GPR)] |
| pub cmpr: Src, |
| |
| #[src_type(SSA)] |
| pub data: Src, |
| |
| pub atom_op: AtomOp, |
| pub atom_type: AtomType, |
| |
| pub addr_offset: i32, |
| |
| pub mem_space: MemSpace, |
| pub mem_order: MemOrder, |
| pub mem_eviction_priority: MemEvictionPriority, |
| } |
| |
| impl DisplayOp for OpAtom { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "atom{}{}{}{}{}", |
| self.atom_op, |
| self.atom_type, |
| self.mem_space, |
| self.mem_order, |
| self.mem_eviction_priority, |
| )?; |
| write!(f, " [")?; |
| if !self.addr.is_zero() { |
| write!(f, "{}", self.addr)?; |
| } |
| if self.addr_offset > 0 { |
| if !self.addr.is_zero() { |
| write!(f, "+")?; |
| } |
| write!(f, "{:#x}", self.addr_offset)?; |
| } |
| write!(f, "]")?; |
| if self.atom_op == AtomOp::CmpExch(AtomCmpSrc::Separate) { |
| write!(f, " {}", self.cmpr)?; |
| } |
| write!(f, " {}", self.data) |
| } |
| } |
| impl_display_for_op!(OpAtom); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpAL2P { |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub offset: Src, |
| |
| pub addr: u16, |
| pub comps: u8, |
| pub output: bool, |
| } |
| |
| impl DisplayOp for OpAL2P { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "al2p")?; |
| if self.output { |
| write!(f, ".o")?; |
| } |
| write!(f, " a[{:#x}", self.addr)?; |
| if !self.offset.is_zero() { |
| write!(f, "+{}", self.offset)?; |
| } |
| write!(f, "]") |
| } |
| } |
| impl_display_for_op!(OpAL2P); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpALd { |
| pub dst: Dst, |
| |
| #[src_type(GPR)] |
| pub vtx: Src, |
| |
| #[src_type(GPR)] |
| pub offset: Src, |
| |
| pub addr: u16, |
| pub comps: u8, |
| pub patch: bool, |
| pub output: bool, |
| pub phys: bool, |
| } |
| |
| impl DisplayOp for OpALd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "ald")?; |
| if self.output { |
| write!(f, ".o")?; |
| } |
| if self.patch { |
| write!(f, ".p")?; |
| } |
| if self.phys { |
| write!(f, ".phys")?; |
| } |
| write!(f, " a")?; |
| if !self.vtx.is_zero() { |
| write!(f, "[{}]", self.vtx)?; |
| } |
| write!(f, "[{:#x}", self.addr)?; |
| if !self.offset.is_zero() { |
| write!(f, "+{}", self.offset)?; |
| } |
| write!(f, "]") |
| } |
| } |
| impl_display_for_op!(OpALd); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpASt { |
| #[src_type(GPR)] |
| pub vtx: Src, |
| |
| #[src_type(GPR)] |
| pub offset: Src, |
| |
| #[src_type(SSA)] |
| pub data: Src, |
| |
| pub addr: u16, |
| pub comps: u8, |
| pub patch: bool, |
| pub phys: bool, |
| } |
| |
| impl DisplayOp for OpASt { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "ast")?; |
| if self.patch { |
| write!(f, ".p")?; |
| } |
| if self.phys { |
| write!(f, ".phys")?; |
| } |
| write!(f, " a")?; |
| if !self.vtx.is_zero() { |
| write!(f, "[{}]", self.vtx)?; |
| } |
| write!(f, "[{:#x}", self.addr)?; |
| if !self.offset.is_zero() { |
| write!(f, "+{}", self.offset)?; |
| } |
| write!(f, "] {}", self.data) |
| } |
| } |
| impl_display_for_op!(OpASt); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIpa { |
| pub dst: Dst, |
| pub addr: u16, |
| pub freq: InterpFreq, |
| pub loc: InterpLoc, |
| pub inv_w: Src, |
| pub offset: Src, |
| } |
| |
| impl DisplayOp for OpIpa { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "ipa{}{} a[{:#x}] {}", |
| self.freq, self.loc, self.addr, self.inv_w |
| )?; |
| if self.loc == InterpLoc::Offset { |
| write!(f, " {}", self.offset)?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpIpa); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpLdTram { |
| pub dst: Dst, |
| pub addr: u16, |
| pub use_c: bool, |
| } |
| |
| impl DisplayOp for OpLdTram { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "ldtram")?; |
| if self.use_c { |
| write!(f, ".c")?; |
| } else { |
| write!(f, ".ab")?; |
| } |
| write!(f, " a[{:#x}]", self.addr)?; |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpLdTram); |
| |
| #[allow(dead_code)] |
| #[derive(Copy, Clone, Debug)] |
| pub enum CCtlOp { |
| Qry1, // Only available pre-Volta |
| PF1, |
| PF1_5, // Only available pre-Volta |
| PF2, |
| WB, |
| IV, |
| IVAll, |
| RS, |
| RSLB, // Only available pre-Volta |
| IVAllP, // Only available on Volta+ |
| WBAll, // Only available on Volta+ |
| WBAllP, // Only available on Volta+ |
| } |
| |
| impl CCtlOp { |
| pub fn is_all(&self) -> bool { |
| match self { |
| CCtlOp::Qry1 |
| | CCtlOp::PF1 |
| | CCtlOp::PF1_5 |
| | CCtlOp::PF2 |
| | CCtlOp::WB |
| | CCtlOp::IV |
| | CCtlOp::RS |
| | CCtlOp::RSLB => false, |
| CCtlOp::IVAll | CCtlOp::IVAllP | CCtlOp::WBAll | CCtlOp::WBAllP => { |
| true |
| } |
| } |
| } |
| } |
| |
| impl fmt::Display for CCtlOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| CCtlOp::Qry1 => write!(f, "qry1"), |
| CCtlOp::PF1 => write!(f, "pf1"), |
| CCtlOp::PF1_5 => write!(f, "pf1.5"), |
| CCtlOp::PF2 => write!(f, "pf2"), |
| CCtlOp::WB => write!(f, "wb"), |
| CCtlOp::IV => write!(f, "iv"), |
| CCtlOp::IVAll => write!(f, "ivall"), |
| CCtlOp::RS => write!(f, "rs"), |
| CCtlOp::RSLB => write!(f, "rslb"), |
| CCtlOp::IVAllP => write!(f, "ivallp"), |
| CCtlOp::WBAll => write!(f, "wball"), |
| CCtlOp::WBAllP => write!(f, "wballp"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpCCtl { |
| pub op: CCtlOp, |
| |
| pub mem_space: MemSpace, |
| |
| #[src_type(GPR)] |
| pub addr: Src, |
| |
| pub addr_offset: i32, |
| } |
| |
| impl DisplayOp for OpCCtl { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "cctl{}", self.mem_space)?; |
| if !self.op.is_all() { |
| write!(f, " [{}", self.addr)?; |
| if self.addr_offset > 0 { |
| write!(f, "+{:#x}", self.addr_offset)?; |
| } |
| write!(f, "]")?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpCCtl); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpMemBar { |
| pub scope: MemScope, |
| } |
| |
| impl DisplayOp for OpMemBar { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "membar.sc.{}", self.scope) |
| } |
| } |
| impl_display_for_op!(OpMemBar); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBClear { |
| pub dst: Dst, |
| } |
| |
| impl DisplayOp for OpBClear { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "bclear") |
| } |
| } |
| impl_display_for_op!(OpBClear); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBMov { |
| pub dst: Dst, |
| pub src: Src, |
| pub clear: bool, |
| } |
| |
| impl DisplayOp for OpBMov { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "bmov.32")?; |
| if self.clear { |
| write!(f, ".clear")?; |
| } |
| write!(f, " {}", self.src) |
| } |
| } |
| impl_display_for_op!(OpBMov); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBreak { |
| #[dst_type(Bar)] |
| pub bar_out: Dst, |
| |
| #[src_type(Bar)] |
| pub bar_in: Src, |
| |
| #[src_type(Pred)] |
| pub cond: Src, |
| } |
| |
| impl DisplayOp for OpBreak { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "break {} {}", self.bar_in, self.cond) |
| } |
| } |
| impl_display_for_op!(OpBreak); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBSSy { |
| #[dst_type(Bar)] |
| pub bar_out: Dst, |
| |
|     #[src_type(Bar)] |
|     pub bar_in: Src, |
| |
| #[src_type(Pred)] |
| pub cond: Src, |
| |
| pub target: Label, |
| } |
| |
| impl DisplayOp for OpBSSy { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "bssy {} {} {}", self.bar_in, self.cond, self.target) |
| } |
| } |
| impl_display_for_op!(OpBSSy); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBSync { |
| #[src_type(Bar)] |
| pub bar: Src, |
| |
| #[src_type(Pred)] |
| pub cond: Src, |
| } |
| |
| impl DisplayOp for OpBSync { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "bsync {} {}", self.bar, self.cond) |
| } |
| } |
| impl_display_for_op!(OpBSync); |
| |
| /// Takes the branch when both the guard predicate and the condition |
| /// source evaluate to true. |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBra { |
| pub target: Label, |
| |
| /// Can be a UPred if uniform |
| // TODO: actually .u has another form with an additional UPred input. |
| #[src_type(Pred)] |
| pub cond: Src, |
| } |
| |
| impl DisplayOp for OpBra { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "bra {} {}", self.cond, self.target) |
| } |
| } |
| impl_display_for_op!(OpBra); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSSy { |
| pub target: Label, |
| } |
| |
| impl DisplayOp for OpSSy { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "ssy {}", self.target) |
| } |
| } |
| impl_display_for_op!(OpSSy); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSync { |
| pub target: Label, |
| } |
| |
| impl DisplayOp for OpSync { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "sync {}", self.target) |
| } |
| } |
| impl_display_for_op!(OpSync); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBrk { |
| pub target: Label, |
| } |
| |
| impl DisplayOp for OpBrk { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "brk {}", self.target) |
| } |
| } |
| impl_display_for_op!(OpBrk); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpPBk { |
| pub target: Label, |
| } |
| |
| impl DisplayOp for OpPBk { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "pbk {}", self.target) |
| } |
| } |
| impl_display_for_op!(OpPBk); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpCont { |
| pub target: Label, |
| } |
| |
| impl DisplayOp for OpCont { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "cont {}", self.target) |
| } |
| } |
| impl_display_for_op!(OpCont); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpPCnt { |
| pub target: Label, |
| } |
| |
| impl DisplayOp for OpPCnt { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "pcnt {}", self.target) |
| } |
| } |
| impl_display_for_op!(OpPCnt); |
| |
| #[repr(C)] |
| #[derive(Clone, SrcsAsSlice, DstsAsSlice)] |
| pub struct OpExit {} |
| |
| impl DisplayOp for OpExit { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "exit") |
| } |
| } |
| impl_display_for_op!(OpExit); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpWarpSync { |
| pub mask: u32, |
| } |
| |
| impl DisplayOp for OpWarpSync { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "warpsync 0x{:x}", self.mask) |
| } |
| } |
| impl_display_for_op!(OpWarpSync); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpBar {} |
| |
| impl DisplayOp for OpBar { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "bar.sync") |
| } |
| } |
| impl_display_for_op!(OpBar); |
| |
| /// Instruction only used on Kepler(A|B). |
| /// Kepler has explicit dependency tracking for texture loads. |
| /// When a texture load is executed, it is put on some kind of FIFO |
| /// queue for later execution. |
| /// Before the results of a texture load are used, we need to wait on |
| /// the queue; texdepbar waits until the queue has at most |
| /// `textures_left` elements. |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpTexDepBar { |
| pub textures_left: u8, |
| } |
| |
| impl OpTexDepBar { |
| /// Maximum value of textures_left |
| /// |
| /// The maximum encodable value is 63. However, nvcc starts emitting |
| /// TEXDEPBAR 0x3e as soon as it hits 62 texture instructions. |
| pub const MAX_TEXTURES_LEFT: u8 = 62; |
| } |
| |
| impl DisplayOp for OpTexDepBar { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "texdepbar {}", self.textures_left) |
| } |
| } |
| impl_display_for_op!(OpTexDepBar); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpCS2R { |
| pub dst: Dst, |
| pub idx: u8, |
| } |
| |
| impl DisplayOp for OpCS2R { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "cs2r sr[{:#x}]", self.idx) |
| } |
| } |
| impl_display_for_op!(OpCS2R); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpIsberd { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(SSA)] |
| pub idx: Src, |
| } |
| |
| impl DisplayOp for OpIsberd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "isberd [{}]", self.idx) |
| } |
| } |
| impl_display_for_op!(OpIsberd); |
| |
| /// Vertex Index Load |
| /// (Only available in Kepler) |
| /// |
| /// Takes as input the vertex index and loads the vertex address in |
| /// attribute space. |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpViLd { |
| #[dst_type(GPR)] |
| pub dst: Dst, |
| |
| #[src_type(SSA)] |
| pub idx: Src, |
| |
| pub off: i8, |
| } |
| |
| impl DisplayOp for OpViLd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "vild v[")?; |
| |
| if !self.idx.is_zero() { |
| write!(f, "{}", self.idx)?; |
| if self.off != 0 { |
| write!(f, "{:+}", self.off)?; |
| } |
| } else { |
| write!(f, "{}", self.off)?; |
| } |
| |
| write!(f, "]") |
| } |
| } |
| impl_display_for_op!(OpViLd); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpKill {} |
| |
| impl DisplayOp for OpKill { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "kill") |
| } |
| } |
| impl_display_for_op!(OpKill); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpNop { |
| pub label: Option<Label>, |
| } |
| |
| impl DisplayOp for OpNop { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "nop")?; |
| if let Some(label) = &self.label { |
| write!(f, " {}", label)?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpNop); |
| |
| #[allow(dead_code)] |
| pub enum PixVal { |
| MsCount, |
| CovMask, |
| Covered, |
| Offset, |
| CentroidOffset, |
| MyIndex, |
| InnerCoverage, |
| } |
| |
| impl fmt::Display for PixVal { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| PixVal::MsCount => write!(f, ".mscount"), |
| PixVal::CovMask => write!(f, ".covmask"), |
| PixVal::Covered => write!(f, ".covered"), |
| PixVal::Offset => write!(f, ".offset"), |
| PixVal::CentroidOffset => write!(f, ".centroid_offset"), |
| PixVal::MyIndex => write!(f, ".my_index"), |
| PixVal::InnerCoverage => write!(f, ".inner_coverage"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpPixLd { |
| pub dst: Dst, |
| pub val: PixVal, |
| } |
| |
| impl DisplayOp for OpPixLd { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "pixld{}", self.val) |
| } |
| } |
| impl_display_for_op!(OpPixLd); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpS2R { |
| pub dst: Dst, |
| pub idx: u8, |
| } |
| |
| impl DisplayOp for OpS2R { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "s2r sr[{:#x}]", self.idx) |
| } |
| } |
| impl_display_for_op!(OpS2R); |
| |
| pub enum VoteOp { |
| Any, |
| All, |
| Eq, |
| } |
| |
| impl fmt::Display for VoteOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| VoteOp::Any => write!(f, "any"), |
| VoteOp::All => write!(f, "all"), |
| VoteOp::Eq => write!(f, "eq"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpVote { |
| pub op: VoteOp, |
| |
| #[dst_type(GPR)] |
| pub ballot: Dst, |
| |
| #[dst_type(Pred)] |
| pub vote: Dst, |
| |
| #[src_type(Pred)] |
| pub pred: Src, |
| } |
| |
| impl DisplayOp for OpVote { |
| fn fmt_dsts(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| if self.ballot.is_none() && self.vote.is_none() { |
| write!(f, "none") |
| } else { |
            if !self.ballot.is_none() {
                write!(f, "{}", self.ballot)?;
            }
            if !self.vote.is_none() {
                // Separate the two destinations when both are printed
                if !self.ballot.is_none() {
                    write!(f, " ")?;
                }
                write!(f, "{}", self.vote)?;
            }
| Ok(()) |
| } |
| } |
| |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "vote.{} {}", self.op, self.pred) |
| } |
| } |
| impl_display_for_op!(OpVote); |
| |
| #[allow(dead_code)] |
| #[derive(Copy, Clone)] |
| pub enum MatchOp { |
| All, |
| Any, |
| } |
| |
| impl fmt::Display for MatchOp { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| MatchOp::All => write!(f, ".all"), |
| MatchOp::Any => write!(f, ".any"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpMatch { |
| #[dst_type(Pred)] |
| pub pred: Dst, |
| |
| #[dst_type(GPR)] |
| pub mask: Dst, |
| |
| #[src_type(GPR)] |
| pub src: Src, |
| |
| pub op: MatchOp, |
| pub u64: bool, |
| } |
| |
| impl DisplayOp for OpMatch { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let u64_str = if self.u64 { ".u64" } else { "" }; |
| write!(f, "match{}{} {}", self.op, u64_str, self.src) |
| } |
| } |
| impl_display_for_op!(OpMatch); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpUndef { |
| pub dst: Dst, |
| } |
| |
| impl DisplayOp for OpUndef { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "undef {}", self.dst) |
| } |
| } |
| impl_display_for_op!(OpUndef); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSrcBar { |
| pub src: Src, |
| } |
| |
| impl DisplayOp for OpSrcBar { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "src_bar {}", self.src) |
| } |
| } |
| impl_display_for_op!(OpSrcBar); |
| |
| pub struct VecPair<A, B> { |
| a: Vec<A>, |
| b: Vec<B>, |
| } |
| |
| impl<A, B> VecPair<A, B> { |
| pub fn append(&mut self, other: &mut VecPair<A, B>) { |
| self.a.append(&mut other.a); |
| self.b.append(&mut other.b); |
| } |
| |
| pub fn is_empty(&self) -> bool { |
| debug_assert!(self.a.len() == self.b.len()); |
| self.a.is_empty() |
| } |
| |
| pub fn iter(&self) -> Zip<slice::Iter<'_, A>, slice::Iter<'_, B>> { |
| debug_assert!(self.a.len() == self.b.len()); |
| self.a.iter().zip(self.b.iter()) |
| } |
| |
| pub fn iter_mut( |
| &mut self, |
| ) -> Zip<slice::IterMut<'_, A>, slice::IterMut<'_, B>> { |
| debug_assert!(self.a.len() == self.b.len()); |
| self.a.iter_mut().zip(self.b.iter_mut()) |
| } |
| |
| pub fn len(&self) -> usize { |
| debug_assert!(self.a.len() == self.b.len()); |
| self.a.len() |
| } |
| |
| pub fn new() -> Self { |
| Self { |
| a: Vec::new(), |
| b: Vec::new(), |
| } |
| } |
| |
| pub fn push(&mut self, a: A, b: B) { |
| debug_assert!(self.a.len() == self.b.len()); |
| self.a.push(a); |
| self.b.push(b); |
| } |
| } |
| |
| impl<A: Clone, B: Clone> VecPair<A, B> { |
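    /// Retains only the pairs for which `f` returns true, preserving the
    /// order of the retained pairs. This mirrors `Vec::retain()`, except
    /// that the predicate sees both halves of each pair.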
| pub fn retain(&mut self, mut f: impl FnMut(&A, &B) -> bool) { |
| debug_assert!(self.a.len() == self.b.len()); |
| let len = self.a.len(); |
| let mut i = 0_usize; |
| while i < len { |
| if !f(&self.a[i], &self.b[i]) { |
| break; |
| } |
| i += 1; |
| } |
| |
| let mut new_len = i; |
| |
| // Don't check this one twice. |
| i += 1; |
| |
| while i < len { |
| // This could be more efficient but it's good enough for our |
| // purposes since everything we're storing is small and has a |
| // trivial Drop. |
| if f(&self.a[i], &self.b[i]) { |
| self.a[new_len] = self.a[i].clone(); |
| self.b[new_len] = self.b[i].clone(); |
| new_len += 1; |
| } |
| i += 1; |
| } |
| |
| if new_len < len { |
| self.a.truncate(new_len); |
| self.b.truncate(new_len); |
| } |
| } |
| } |
| |
| mod phi { |
| #[allow(unused_imports)] |
| use crate::ir::{OpPhiDsts, OpPhiSrcs}; |
| use compiler::bitset::IntoBitIndex; |
| use std::fmt; |
| |
| /// A phi node |
| /// |
| /// Phis in NAK are implemented differently from NIR and similar IRs. |
| /// Instead of having a single phi instruction which lives in the successor |
| /// block, each `Phi` represents a single merged 32-bit (or 1-bit for |
| /// predicates) value and we have separate [`OpPhiSrcs`] and [`OpPhiDsts`] |
| /// instructions which map phis to sources and destinations. |
| /// |
| /// One of the problems fundamental to phis is that they really live on the |
| /// edges between blocks. Regardless of where the phi instruction lives in |
| /// the IR data structures, its sources are consumed at the end of the |
| /// predecessor block and its destinations are defined at the start of the |
    /// successor block; all phi sources are consumed and all phi destinations
    /// go live simultaneously for any given CFG edge. For a phi that participates
| /// in a back-edge, this means that the source of the phi may be consumed |
| /// after (in block order) the destination goes live. |
| /// |
| /// In NIR, this has caused no end of headaches. Most passes which need to |
| /// process phis ignore phis when first processing a block and then have a |
| /// special case at the end of each block which walks the successors and |
| /// processes the successor's phis, looking only at the phi sources whose |
| /// predecessor matches the block. This is clunky and often forgotten by |
    /// optimization and lowering pass authors. It's also easily missed by
    /// testing since it only really breaks if you have a phi which participates
    /// in a back-edge, so such bugs often get found later when something
    /// breaks in the wild.
| /// |
| /// To work around this (and also make things a little more Rust-friendly), |
| /// NAK places the instruction which consumes phi sources at the end of the |
| /// predecessor block and the instruction which defines phi destinations at |
| /// the start of the successor block. This structurally eliminates the |
| /// problem that has plagued NIR for years. The cost to this solution is |
| /// that we have to create maps from phis to/from SSA values whenever we |
| /// want to optimize the phis themselves. However, this affects few enough |
| /// passes that the benefits to the rest of the IR are worth the trade-off, |
| /// at least for a back-end compiler. |
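    ///
    /// Schematically, a merged value prints along these lines (register
    /// names are hypothetical):
    ///
    /// ```text
    /// block0 L0 [] -> {
    ///     ...
    ///     phi_src φ0 = r0    // consumes r0 at the end of the predecessor
    /// } -> [1]
    /// block1 L1 [0] -> {
    ///     phi_dst r1 = φ0    // defines r1 at the start of the successor
    ///     ...
    /// } -> [...]
    /// ```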
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub struct Phi { |
| idx: u32, |
| } |
| |
| impl IntoBitIndex for Phi { |
| fn into_bit_index(self) -> usize { |
| self.idx.try_into().unwrap() |
| } |
| } |
| |
| impl fmt::Display for Phi { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "φ{}", self.idx) |
| } |
| } |
| |
| pub struct PhiAllocator { |
| count: u32, |
| } |
| |
| impl PhiAllocator { |
| pub fn new() -> PhiAllocator { |
| PhiAllocator { count: 0 } |
| } |
| |
| pub fn alloc(&mut self) -> Phi { |
| let idx = self.count; |
| self.count = idx + 1; |
| Phi { idx } |
| } |
| } |
| } |
| pub use phi::{Phi, PhiAllocator}; |
| |
| /// An instruction which maps [Phi]s to sources in the predecessor block |
| #[repr(C)] |
| #[derive(DstsAsSlice)] |
| pub struct OpPhiSrcs { |
| pub srcs: VecPair<Phi, Src>, |
| } |
| |
| impl OpPhiSrcs { |
| pub fn new() -> OpPhiSrcs { |
| OpPhiSrcs { |
| srcs: VecPair::new(), |
| } |
| } |
| } |
| |
| impl AsSlice<Src> for OpPhiSrcs { |
| type Attr = SrcType; |
| |
| fn as_slice(&self) -> &[Src] { |
| &self.srcs.b |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Src] { |
| &mut self.srcs.b |
| } |
| |
| fn attrs(&self) -> SrcTypeList { |
| SrcTypeList::Uniform(SrcType::GPR) |
| } |
| } |
| |
| impl DisplayOp for OpPhiSrcs { |
| fn fmt_dsts(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| Ok(()) |
| } |
| |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "phi_src ")?; |
| for (i, (phi, src)) in self.srcs.iter().enumerate() { |
| if i > 0 { |
| write!(f, ", ")?; |
| } |
| write!(f, "{phi} = {src}")?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpPhiSrcs); |
| |
/// An instruction which maps [Phi]s to destinations in the successor block
| #[repr(C)] |
| #[derive(SrcsAsSlice)] |
| pub struct OpPhiDsts { |
| pub dsts: VecPair<Phi, Dst>, |
| } |
| |
| impl OpPhiDsts { |
| pub fn new() -> OpPhiDsts { |
| OpPhiDsts { |
| dsts: VecPair::new(), |
| } |
| } |
| } |
| |
| impl AsSlice<Dst> for OpPhiDsts { |
| type Attr = DstType; |
| |
| fn as_slice(&self) -> &[Dst] { |
| &self.dsts.b |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Dst] { |
| &mut self.dsts.b |
| } |
| |
| fn attrs(&self) -> DstTypeList { |
| DstTypeList::Uniform(DstType::Vec) |
| } |
| } |
| |
| impl DisplayOp for OpPhiDsts { |
| fn fmt_dsts(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| Ok(()) |
| } |
| |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "phi_dst ")?; |
| for (i, (phi, dst)) in self.dsts.iter().enumerate() { |
| if i > 0 { |
| write!(f, ", ")?; |
| } |
| write!(f, "{dst} = {phi}")?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpPhiDsts); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpCopy { |
| pub dst: Dst, |
| pub src: Src, |
| } |
| |
| impl DisplayOp for OpCopy { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "copy {}", self.src) |
| } |
| } |
| impl_display_for_op!(OpCopy); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| /// Copies a value and pins its destination in the register file |
| pub struct OpPin { |
| pub dst: Dst, |
| #[src_type(SSA)] |
| pub src: Src, |
| } |
| |
| impl DisplayOp for OpPin { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "pin {}", self.src) |
| } |
| } |
| impl_display_for_op!(OpPin); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| /// Copies a pinned value to an unpinned value |
| pub struct OpUnpin { |
| pub dst: Dst, |
| #[src_type(SSA)] |
| pub src: Src, |
| } |
| |
| impl DisplayOp for OpUnpin { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "unpin {}", self.src) |
| } |
| } |
| impl_display_for_op!(OpUnpin); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpSwap { |
| pub dsts: [Dst; 2], |
| pub srcs: [Src; 2], |
| } |
| |
| impl DisplayOp for OpSwap { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "swap {} {}", self.srcs[0], self.srcs[1]) |
| } |
| } |
| impl_display_for_op!(OpSwap); |
| |
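/// A parallel copy
///
/// All sources are conceptually read before any destination is written, so
/// cyclic copies such as `par_copy r0 = r1, r1 = r0` (register names
/// illustrative) are well-defined. When set, `tmp` provides a scratch
/// register that lowering can use to break such cycles.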
| #[repr(C)] |
| pub struct OpParCopy { |
| pub dsts_srcs: VecPair<Dst, Src>, |
| pub tmp: Option<RegRef>, |
| } |
| |
| impl OpParCopy { |
| pub fn new() -> OpParCopy { |
| OpParCopy { |
| dsts_srcs: VecPair::new(), |
| tmp: None, |
| } |
| } |
| |
| pub fn is_empty(&self) -> bool { |
| self.dsts_srcs.is_empty() |
| } |
| |
| pub fn push(&mut self, dst: Dst, src: Src) { |
| self.dsts_srcs.push(dst, src); |
| } |
| } |
| |
| impl AsSlice<Src> for OpParCopy { |
| type Attr = SrcType; |
| |
| fn as_slice(&self) -> &[Src] { |
| &self.dsts_srcs.b |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Src] { |
| &mut self.dsts_srcs.b |
| } |
| |
| fn attrs(&self) -> SrcTypeList { |
| SrcTypeList::Uniform(SrcType::GPR) |
| } |
| } |
| |
| impl AsSlice<Dst> for OpParCopy { |
| type Attr = DstType; |
| |
| fn as_slice(&self) -> &[Dst] { |
| &self.dsts_srcs.a |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Dst] { |
| &mut self.dsts_srcs.a |
| } |
| |
| fn attrs(&self) -> DstTypeList { |
| DstTypeList::Uniform(DstType::Vec) |
| } |
| } |
| |
| impl DisplayOp for OpParCopy { |
| fn fmt_dsts(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| Ok(()) |
| } |
| |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "par_copy")?; |
| for (i, (dst, src)) in self.dsts_srcs.iter().enumerate() { |
| if i > 0 { |
| write!(f, ",")?; |
| } |
| write!(f, " {} = {}", dst, src)?; |
| } |
| Ok(()) |
| } |
| } |
| impl_display_for_op!(OpParCopy); |
| |
| #[repr(C)] |
| #[derive(DstsAsSlice)] |
| pub struct OpRegOut { |
| pub srcs: Vec<Src>, |
| } |
| |
| impl AsSlice<Src> for OpRegOut { |
| type Attr = SrcType; |
| |
| fn as_slice(&self) -> &[Src] { |
| &self.srcs |
| } |
| |
| fn as_mut_slice(&mut self) -> &mut [Src] { |
| &mut self.srcs |
| } |
| |
| fn attrs(&self) -> SrcTypeList { |
| SrcTypeList::Uniform(SrcType::GPR) |
| } |
| } |
| |
| impl DisplayOp for OpRegOut { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "reg_out {{")?; |
| for (i, src) in self.srcs.iter().enumerate() { |
| if i > 0 { |
| write!(f, ",")?; |
| } |
| write!(f, " {}", src)?; |
| } |
| write!(f, " }}") |
| } |
| } |
| impl_display_for_op!(OpRegOut); |
| |
| #[derive(Copy, Clone, Debug, PartialEq)] |
| pub enum OutType { |
| Emit, |
| Cut, |
| EmitThenCut, |
| } |
| |
| impl fmt::Display for OutType { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| OutType::Emit => write!(f, "emit"), |
| OutType::Cut => write!(f, "cut"), |
| OutType::EmitThenCut => write!(f, "emit_then_cut"), |
| } |
| } |
| } |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpOut { |
| pub dst: Dst, |
| |
| #[src_type(SSA)] |
| pub handle: Src, |
| |
| #[src_type(ALU)] |
| pub stream: Src, |
| |
| pub out_type: OutType, |
| } |
| |
| impl DisplayOp for OpOut { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "out.{} {} {}", self.out_type, self.handle, self.stream) |
| } |
| } |
| impl_display_for_op!(OpOut); |
| |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpOutFinal { |
| #[src_type(SSA)] |
| pub handle: Src, |
| } |
| |
| impl DisplayOp for OpOutFinal { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "out.final {{ {} }}", self.handle) |
| } |
| } |
| impl_display_for_op!(OpOutFinal); |
| |
| /// Describes an annotation on an instruction. |
| #[repr(C)] |
| #[derive(SrcsAsSlice, DstsAsSlice)] |
| pub struct OpAnnotate { |
| /// The annotation |
| pub annotation: String, |
| } |
| |
| impl DisplayOp for OpAnnotate { |
| fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "// {}", self.annotation) |
| } |
| } |
| |
| impl fmt::Display for OpAnnotate { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| self.fmt_op(f) |
| } |
| } |
| |
| #[derive(DisplayOp, DstsAsSlice, SrcsAsSlice, FromVariants)] |
| pub enum Op { |
| FAdd(Box<OpFAdd>), |
| FFma(Box<OpFFma>), |
| FMnMx(Box<OpFMnMx>), |
| FMul(Box<OpFMul>), |
| Rro(Box<OpRro>), |
| MuFu(Box<OpMuFu>), |
| FSet(Box<OpFSet>), |
| FSetP(Box<OpFSetP>), |
| FSwzAdd(Box<OpFSwzAdd>), |
| FSwz(Box<OpFSwz>), |
| DAdd(Box<OpDAdd>), |
| DFma(Box<OpDFma>), |
| DMnMx(Box<OpDMnMx>), |
| DMul(Box<OpDMul>), |
| DSetP(Box<OpDSetP>), |
| HAdd2(Box<OpHAdd2>), |
| HFma2(Box<OpHFma2>), |
| HMul2(Box<OpHMul2>), |
| HSet2(Box<OpHSet2>), |
| HSetP2(Box<OpHSetP2>), |
| Imma(Box<OpImma>), |
| Hmma(Box<OpHmma>), |
| Ldsm(Box<OpLdsm>), |
| HMnMx2(Box<OpHMnMx2>), |
| BMsk(Box<OpBMsk>), |
| BRev(Box<OpBRev>), |
| Bfe(Box<OpBfe>), |
| Flo(Box<OpFlo>), |
| IAbs(Box<OpIAbs>), |
| IAdd2(Box<OpIAdd2>), |
| IAdd2X(Box<OpIAdd2X>), |
| IAdd3(Box<OpIAdd3>), |
| IAdd3X(Box<OpIAdd3X>), |
| IDp4(Box<OpIDp4>), |
| IMad(Box<OpIMad>), |
| IMad64(Box<OpIMad64>), |
| IMul(Box<OpIMul>), |
| IMnMx(Box<OpIMnMx>), |
| ISetP(Box<OpISetP>), |
| Lea(Box<OpLea>), |
| LeaX(Box<OpLeaX>), |
| Lop2(Box<OpLop2>), |
| Lop3(Box<OpLop3>), |
| PopC(Box<OpPopC>), |
| Shf(Box<OpShf>), |
| Shl(Box<OpShl>), |
| Shr(Box<OpShr>), |
| F2F(Box<OpF2F>), |
| F2FP(Box<OpF2FP>), |
| F2I(Box<OpF2I>), |
| I2F(Box<OpI2F>), |
| I2I(Box<OpI2I>), |
| FRnd(Box<OpFRnd>), |
| Mov(Box<OpMov>), |
| Prmt(Box<OpPrmt>), |
| Sel(Box<OpSel>), |
| Shfl(Box<OpShfl>), |
| PLop3(Box<OpPLop3>), |
| PSetP(Box<OpPSetP>), |
| R2UR(Box<OpR2UR>), |
| Redux(Box<OpRedux>), |
| Tex(Box<OpTex>), |
| Tld(Box<OpTld>), |
| Tld4(Box<OpTld4>), |
| Tmml(Box<OpTmml>), |
| Txd(Box<OpTxd>), |
| Txq(Box<OpTxq>), |
| SuLd(Box<OpSuLd>), |
| SuSt(Box<OpSuSt>), |
| SuAtom(Box<OpSuAtom>), |
| SuClamp(Box<OpSuClamp>), |
| SuBfm(Box<OpSuBfm>), |
| SuEau(Box<OpSuEau>), |
| IMadSp(Box<OpIMadSp>), |
| SuLdGa(Box<OpSuLdGa>), |
| SuStGa(Box<OpSuStGa>), |
| Ld(Box<OpLd>), |
| Ldc(Box<OpLdc>), |
| LdSharedLock(Box<OpLdSharedLock>), |
| St(Box<OpSt>), |
| StSCheckUnlock(Box<OpStSCheckUnlock>), |
| Atom(Box<OpAtom>), |
| AL2P(Box<OpAL2P>), |
| ALd(Box<OpALd>), |
| ASt(Box<OpASt>), |
| Ipa(Box<OpIpa>), |
| LdTram(Box<OpLdTram>), |
| CCtl(Box<OpCCtl>), |
| MemBar(Box<OpMemBar>), |
| BClear(Box<OpBClear>), |
| BMov(Box<OpBMov>), |
| Break(Box<OpBreak>), |
| BSSy(Box<OpBSSy>), |
| BSync(Box<OpBSync>), |
| Bra(Box<OpBra>), |
| SSy(OpSSy), |
| Sync(OpSync), |
| Brk(OpBrk), |
| PBk(OpPBk), |
| Cont(OpCont), |
| PCnt(OpPCnt), |
| Exit(OpExit), |
| WarpSync(Box<OpWarpSync>), |
| Bar(Box<OpBar>), |
| TexDepBar(Box<OpTexDepBar>), |
| CS2R(Box<OpCS2R>), |
| Isberd(Box<OpIsberd>), |
| ViLd(Box<OpViLd>), |
| Kill(Box<OpKill>), |
| Nop(OpNop), |
| PixLd(Box<OpPixLd>), |
| S2R(Box<OpS2R>), |
| Vote(Box<OpVote>), |
| Match(Box<OpMatch>), |
| Undef(Box<OpUndef>), |
| SrcBar(Box<OpSrcBar>), |
| PhiSrcs(Box<OpPhiSrcs>), |
| PhiDsts(Box<OpPhiDsts>), |
| Copy(Box<OpCopy>), |
| Pin(Box<OpPin>), |
| Unpin(Box<OpUnpin>), |
| Swap(Box<OpSwap>), |
| ParCopy(Box<OpParCopy>), |
| RegOut(Box<OpRegOut>), |
| Out(Box<OpOut>), |
| OutFinal(Box<OpOutFinal>), |
| Annotate(Box<OpAnnotate>), |
| } |
| impl_display_for_op!(Op); |
| |
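// Boxing the larger payloads above keeps `Op` itself at two words: a
// discriminant plus either a pointer or a small inline payload such as a
// `Label`, as the assertion below checks on 64-bit targets.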
| #[cfg(target_arch = "x86_64")] |
| const _: () = { |
| debug_assert!(size_of::<Op>() == 16); |
| }; |
| |
| impl Op { |
| pub fn is_branch(&self) -> bool { |
| match self { |
| Op::Bra(_) |
| | Op::Sync(_) |
| | Op::Brk(_) |
| | Op::Cont(_) |
| | Op::Exit(_) => true, |
| _ => false, |
| } |
| } |
| |
| pub fn is_fp64(&self) -> bool { |
| match self { |
| Op::MuFu(op) => matches!(op.op, MuFuOp::Rcp64H | MuFuOp::Rsq64H), |
| Op::DAdd(_) |
| | Op::DFma(_) |
| | Op::DMnMx(_) |
| | Op::DMul(_) |
| | Op::DSetP(_) => true, |
| Op::F2F(op) => op.src_type.bits() == 64 || op.dst_type.bits() == 64, |
| Op::F2I(op) => op.src_type.bits() == 64 || op.dst_type.bits() == 64, |
| Op::I2F(op) => op.src_type.bits() == 64 || op.dst_type.bits() == 64, |
| Op::FRnd(op) => { |
| op.src_type.bits() == 64 || op.dst_type.bits() == 64 |
| } |
| _ => false, |
| } |
| } |
| |
| pub fn has_fixed_latency(&self, sm: u8) -> bool { |
| match self { |
| // Float ALU |
| Op::F2FP(_) |
| | Op::FAdd(_) |
| | Op::FFma(_) |
| | Op::FMnMx(_) |
| | Op::FMul(_) |
| | Op::FSet(_) |
| | Op::FSetP(_) |
| | Op::HAdd2(_) |
| | Op::HFma2(_) |
| | Op::HMul2(_) |
| | Op::HSet2(_) |
| | Op::HSetP2(_) |
| | Op::HMnMx2(_) |
| | Op::FSwz(_) |
| | Op::FSwzAdd(_) => true, |
| |
| // Multi-function unit is variable latency |
| Op::Rro(_) | Op::MuFu(_) => false, |
| |
| // Double-precision float ALU |
| Op::DAdd(_) |
| | Op::DFma(_) |
| | Op::DMnMx(_) |
| | Op::DMul(_) |
| | Op::DSetP(_) => false, |
| |
| // Matrix Multiply Add |
| Op::Imma(_) | Op::Hmma(_) | Op::Ldsm(_) => false, |
| |
| // Integer ALU |
| Op::BRev(_) | Op::Flo(_) | Op::PopC(_) => false, |
| Op::IMad(_) | Op::IMul(_) => sm >= 70, |
| Op::BMsk(_) |
| | Op::IAbs(_) |
| | Op::IAdd2(_) |
| | Op::IAdd2X(_) |
| | Op::IAdd3(_) |
| | Op::IAdd3X(_) |
| | Op::IDp4(_) |
| | Op::IMad64(_) |
| | Op::IMnMx(_) |
| | Op::ISetP(_) |
| | Op::Lea(_) |
| | Op::LeaX(_) |
| | Op::Lop2(_) |
| | Op::Lop3(_) |
| | Op::SuClamp(_) |
| | Op::SuBfm(_) |
| | Op::SuEau(_) |
| | Op::IMadSp(_) |
| | Op::Shf(_) |
| | Op::Shl(_) |
| | Op::Shr(_) |
| | Op::Bfe(_) => true, |
| |
| // Conversions are variable latency?!? |
| Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::I2I(_) | Op::FRnd(_) => { |
| false |
| } |
| |
| // Move ops |
| Op::Mov(_) | Op::Prmt(_) | Op::Sel(_) => true, |
| Op::Shfl(_) => false, |
| |
| // Predicate ops |
| Op::PLop3(_) | Op::PSetP(_) => true, |
| |
| // Uniform ops |
| Op::R2UR(_) | Op::Redux(_) => false, |
| |
| // Texture ops |
| Op::Tex(_) |
| | Op::Tld(_) |
| | Op::Tld4(_) |
| | Op::Tmml(_) |
| | Op::Txd(_) |
| | Op::Txq(_) => false, |
| |
| // Surface ops |
| Op::SuLd(_) |
| | Op::SuSt(_) |
| | Op::SuAtom(_) |
| | Op::SuLdGa(_) |
| | Op::SuStGa(_) => false, |
| |
| // Memory ops |
| Op::Ld(_) |
| | Op::Ldc(_) |
| | Op::LdSharedLock(_) |
| | Op::St(_) |
| | Op::StSCheckUnlock(_) |
| | Op::Atom(_) |
| | Op::AL2P(_) |
| | Op::ALd(_) |
| | Op::ASt(_) |
| | Op::Ipa(_) |
| | Op::CCtl(_) |
| | Op::LdTram(_) |
| | Op::MemBar(_) => false, |
| |
| // Control-flow ops |
| Op::BClear(_) |
| | Op::Break(_) |
| | Op::BSSy(_) |
| | Op::BSync(_) |
| | Op::SSy(_) |
| | Op::Sync(_) |
| | Op::Brk(_) |
| | Op::PBk(_) |
| | Op::Cont(_) |
| | Op::PCnt(_) |
| | Op::Bra(_) |
| | Op::Exit(_) |
| | Op::WarpSync(_) => false, |
| |
            // The barrier half is HW scoreboarded but the GPR isn't. When
| // moving from a GPR to a barrier, we still need a token for WaR |
| // hazards. |
| Op::BMov(_) => false, |
| |
| // Geometry ops |
| Op::Out(_) | Op::OutFinal(_) => false, |
| |
| // Miscellaneous ops |
| Op::Bar(_) |
| | Op::TexDepBar(_) |
| | Op::CS2R(_) |
| | Op::Isberd(_) |
| | Op::ViLd(_) |
| | Op::Kill(_) |
| | Op::PixLd(_) |
| | Op::S2R(_) |
| | Op::Match(_) => false, |
| Op::Nop(_) | Op::Vote(_) => true, |
| |
| // Virtual ops |
| Op::Undef(_) |
| | Op::SrcBar(_) |
| | Op::PhiSrcs(_) |
| | Op::PhiDsts(_) |
| | Op::Copy(_) |
| | Op::Pin(_) |
| | Op::Unpin(_) |
| | Op::Swap(_) |
| | Op::ParCopy(_) |
| | Op::RegOut(_) |
| | Op::Annotate(_) => { |
| panic!("Not a hardware opcode") |
| } |
| } |
| } |
| |
    /// Returns true for decoupled instructions which, given how we use them,
    /// never need scoreboards.
| pub fn no_scoreboard(&self) -> bool { |
| match self { |
| Op::BClear(_) |
| | Op::Break(_) |
| | Op::BSSy(_) |
| | Op::BSync(_) |
| | Op::SSy(_) |
| | Op::Sync(_) |
| | Op::Brk(_) |
| | Op::PBk(_) |
| | Op::Cont(_) |
| | Op::PCnt(_) |
| | Op::Bra(_) |
| | Op::Exit(_) => true, |
| _ => false, |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub enum PredRef { |
| None, |
| SSA(SSAValue), |
| Reg(RegRef), |
| } |
| |
| impl PredRef { |
| #[allow(dead_code)] |
| pub fn as_reg(&self) -> Option<&RegRef> { |
| match self { |
| PredRef::Reg(r) => Some(r), |
| _ => None, |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn as_ssa(&self) -> Option<&SSAValue> { |
| match self { |
| PredRef::SSA(r) => Some(r), |
| _ => None, |
| } |
| } |
| |
| pub fn is_none(&self) -> bool { |
| matches!(self, PredRef::None) |
| } |
| |
| pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> { |
| match self { |
| PredRef::None | PredRef::Reg(_) => &[], |
| PredRef::SSA(ssa) => slice::from_ref(ssa), |
| } |
| .iter() |
| } |
| |
| pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> { |
| match self { |
| PredRef::None | PredRef::Reg(_) => &mut [], |
| PredRef::SSA(ssa) => slice::from_mut(ssa), |
| } |
| .iter_mut() |
| } |
| } |
| |
| impl From<RegRef> for PredRef { |
| fn from(reg: RegRef) -> PredRef { |
| PredRef::Reg(reg) |
| } |
| } |
| |
| impl From<SSAValue> for PredRef { |
| fn from(ssa: SSAValue) -> PredRef { |
| PredRef::SSA(ssa) |
| } |
| } |
| |
| impl fmt::Display for PredRef { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| PredRef::None => write!(f, "pT"), |
| PredRef::SSA(ssa) => ssa.fmt(f), |
| PredRef::Reg(reg) => reg.fmt(f), |
| } |
| } |
| } |
| |
| #[derive(Clone, Copy)] |
| pub struct Pred { |
| pub pred_ref: PredRef, |
| pub pred_inv: bool, |
| } |
| |
| impl Pred { |
| pub fn is_true(&self) -> bool { |
| self.pred_ref.is_none() && !self.pred_inv |
| } |
| |
| pub fn is_false(&self) -> bool { |
| self.pred_ref.is_none() && self.pred_inv |
| } |
| |
| pub fn iter_ssa(&self) -> slice::Iter<'_, SSAValue> { |
| self.pred_ref.iter_ssa() |
| } |
| |
| pub fn iter_ssa_mut(&mut self) -> slice::IterMut<'_, SSAValue> { |
| self.pred_ref.iter_ssa_mut() |
| } |
| |
| pub fn bnot(self) -> Self { |
| Pred { |
| pred_ref: self.pred_ref, |
| pred_inv: !self.pred_inv, |
| } |
| } |
| } |
| |
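// `true` converts to the always-true predicate `pT` and `false` to its
// inversion, which never passes.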
| impl From<bool> for Pred { |
| fn from(b: bool) -> Self { |
| Pred { |
| pred_ref: PredRef::None, |
| pred_inv: !b, |
| } |
| } |
| } |
| |
| impl<T: Into<PredRef>> From<T> for Pred { |
| fn from(p: T) -> Self { |
| Pred { |
| pred_ref: p.into(), |
| pred_inv: false, |
| } |
| } |
| } |
| |
| impl fmt::Display for Pred { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| if self.pred_inv { |
| write!(f, "!")?; |
| } |
| self.pred_ref.fmt(f) |
| } |
| } |
| |
| pub const MIN_INSTR_DELAY: u8 = 1; |
| |
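/// Scheduling dependencies attached to each instruction: a static stall
/// `delay` in cycles, a yield flag, optional read and write scoreboard
/// barrier indices (0..6, stored as -1 when unset), a mask of barriers to
/// wait on, and an operand reuse mask.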
| pub struct InstrDeps { |
| pub delay: u8, |
| pub yld: bool, |
| wr_bar: i8, |
| rd_bar: i8, |
| pub wt_bar_mask: u8, |
| pub reuse_mask: u8, |
| } |
| |
| impl InstrDeps { |
| pub fn new() -> InstrDeps { |
| InstrDeps { |
| delay: 0, |
| yld: false, |
| wr_bar: -1, |
| rd_bar: -1, |
| wt_bar_mask: 0, |
| reuse_mask: 0, |
| } |
| } |
| |
| pub fn rd_bar(&self) -> Option<u8> { |
| if self.rd_bar < 0 { |
| None |
| } else { |
| Some(self.rd_bar.try_into().unwrap()) |
| } |
| } |
| |
| pub fn wr_bar(&self) -> Option<u8> { |
| if self.wr_bar < 0 { |
| None |
| } else { |
| Some(self.wr_bar.try_into().unwrap()) |
| } |
| } |
| |
| pub fn set_delay(&mut self, delay: u8) { |
| self.delay = delay; |
| } |
| |
| pub fn set_yield(&mut self, yld: bool) { |
| self.yld = yld; |
| } |
| |
| pub fn set_rd_bar(&mut self, idx: u8) { |
| assert!(idx < 6); |
| self.rd_bar = idx.try_into().unwrap(); |
| } |
| |
| pub fn set_wr_bar(&mut self, idx: u8) { |
| assert!(idx < 6); |
| self.wr_bar = idx.try_into().unwrap(); |
| } |
| |
| pub fn add_wt_bar(&mut self, idx: u8) { |
| self.add_wt_bar_mask(1 << idx); |
| } |
| |
| pub fn add_wt_bar_mask(&mut self, bar_mask: u8) { |
| assert!(bar_mask < 1 << 6); |
| self.wt_bar_mask |= bar_mask; |
| } |
| |
| #[allow(dead_code)] |
| pub fn add_reuse(&mut self, idx: u8) { |
| assert!(idx < 6); |
| self.reuse_mask |= 1_u8 << idx; |
| } |
| } |
| |
| impl fmt::Display for InstrDeps { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| if self.delay > 0 { |
| write!(f, " delay={}", self.delay)?; |
| } |
| if self.wt_bar_mask != 0 { |
| write!(f, " wt={:06b}", self.wt_bar_mask)?; |
| } |
| if self.rd_bar >= 0 { |
| write!(f, " rd:{}", self.rd_bar)?; |
| } |
| if self.wr_bar >= 0 { |
| write!(f, " wr:{}", self.wr_bar)?; |
| } |
| if self.reuse_mask != 0 { |
| write!(f, " reuse={:06b}", self.reuse_mask)?; |
| } |
| if self.yld { |
| write!(f, " yld")?; |
| } |
| Ok(()) |
| } |
| } |
| |
| pub struct Instr { |
| pub pred: Pred, |
| pub op: Op, |
| pub deps: InstrDeps, |
| } |
| |
| impl Instr { |
| pub fn new(op: impl Into<Op>) -> Self { |
| Self { |
| op: op.into(), |
| pred: true.into(), |
| deps: InstrDeps::new(), |
| } |
| } |
| |
| pub fn dsts(&self) -> &[Dst] { |
| self.op.dsts_as_slice() |
| } |
| |
| pub fn dsts_mut(&mut self) -> &mut [Dst] { |
| self.op.dsts_as_mut_slice() |
| } |
| |
| pub fn srcs(&self) -> &[Src] { |
| self.op.srcs_as_slice() |
| } |
| |
| pub fn srcs_mut(&mut self) -> &mut [Src] { |
| self.op.srcs_as_mut_slice() |
| } |
| |
| pub fn src_types(&self) -> SrcTypeList { |
| self.op.src_types() |
| } |
| |
| pub fn for_each_ssa_use(&self, mut f: impl FnMut(&SSAValue)) { |
| for ssa in self.pred.iter_ssa() { |
| f(ssa); |
| } |
| for src in self.srcs() { |
| for ssa in src.iter_ssa() { |
| f(ssa); |
| } |
| } |
| } |
| |
| pub fn for_each_ssa_use_mut(&mut self, mut f: impl FnMut(&mut SSAValue)) { |
| for ssa in self.pred.iter_ssa_mut() { |
| f(ssa); |
| } |
| for src in self.srcs_mut() { |
| for ssa in src.iter_ssa_mut() { |
| f(ssa); |
| } |
| } |
| } |
| |
| pub fn for_each_ssa_def(&self, mut f: impl FnMut(&SSAValue)) { |
| for dst in self.dsts() { |
| for ssa in dst.iter_ssa() { |
| f(ssa); |
| } |
| } |
| } |
| |
| pub fn for_each_ssa_def_mut(&mut self, mut f: impl FnMut(&mut SSAValue)) { |
| for dst in self.dsts_mut() { |
| for ssa in dst.iter_ssa_mut() { |
| f(ssa); |
| } |
| } |
| } |
| |
| pub fn is_branch(&self) -> bool { |
| self.op.is_branch() |
| } |
| |
    /// Returns true if `self` is a branch instruction that is always taken.
    /// Returns false for non-branch instructions.
| pub fn is_branch_always_taken(&self) -> bool { |
| if self.pred.is_true() { |
| match &self.op { |
| Op::Bra(bra) => bra.cond.is_true(), |
| _ => self.is_branch(), |
| } |
| } else { |
| false |
| } |
| } |
| |
| pub fn uses_global_mem(&self) -> bool { |
| match &self.op { |
| Op::Atom(op) => op.mem_space != MemSpace::Local, |
| Op::Ld(op) => op.access.space != MemSpace::Local, |
| Op::St(op) => op.access.space != MemSpace::Local, |
| Op::SuAtom(_) |
| | Op::SuLd(_) |
| | Op::SuSt(_) |
| | Op::SuLdGa(_) |
| | Op::SuStGa(_) => true, |
| _ => false, |
| } |
| } |
| |
| pub fn writes_global_mem(&self) -> bool { |
| match &self.op { |
| Op::Atom(op) => matches!(op.mem_space, MemSpace::Global(_)), |
| Op::St(op) => matches!(op.access.space, MemSpace::Global(_)), |
| Op::SuAtom(_) | Op::SuSt(_) | Op::SuStGa(_) => true, |
| _ => false, |
| } |
| } |
| |
| pub fn can_eliminate(&self) -> bool { |
| match &self.op { |
| Op::ASt(_) |
| | Op::SuSt(_) |
| | Op::SuStGa(_) |
| | Op::SuAtom(_) |
| | Op::LdSharedLock(_) |
| | Op::St(_) |
| | Op::StSCheckUnlock(_) |
| | Op::Atom(_) |
| | Op::CCtl(_) |
| | Op::MemBar(_) |
| | Op::Kill(_) |
| | Op::Nop(_) |
| | Op::BSync(_) |
| | Op::Bra(_) |
| | Op::SSy(_) |
| | Op::Sync(_) |
| | Op::Brk(_) |
| | Op::PBk(_) |
| | Op::Cont(_) |
| | Op::PCnt(_) |
| | Op::Exit(_) |
| | Op::WarpSync(_) |
| | Op::Bar(_) |
| | Op::TexDepBar(_) |
| | Op::RegOut(_) |
| | Op::Out(_) |
| | Op::OutFinal(_) |
| | Op::Annotate(_) => false, |
| Op::BMov(op) => !op.clear, |
| _ => true, |
| } |
| } |
| |
| pub fn is_uniform(&self) -> bool { |
| match &self.op { |
| Op::PhiDsts(_) => false, |
| op => op.is_uniform(), |
| } |
| } |
| |
| pub fn needs_yield(&self) -> bool { |
| matches!(&self.op, Op::Bar(_) | Op::BSync(_)) |
| } |
| |
| fn fmt_pred(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| if !self.pred.is_true() { |
| write!(f, "@{} ", self.pred)?; |
| } |
| Ok(()) |
| } |
| } |
| |
| impl fmt::Display for Instr { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "{} {}{}", Fmt(|f| self.fmt_pred(f)), self.op, self.deps) |
| } |
| } |
| |
| impl<T: Into<Op>> From<T> for Instr { |
| fn from(value: T) -> Self { |
| Self::new(value) |
| } |
| } |
| |
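// `SmallVec` is a `None`/`One`/`Many` enum (see `BasicBlock::map_instrs`
// below), so the common case of mapping one instruction to one instruction
// avoids a heap allocation.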
| pub type MappedInstrs = SmallVec<Instr>; |
| |
| pub struct BasicBlock { |
| pub label: Label, |
| |
| /// Whether or not this block is uniform |
| /// |
| /// If true, then all non-exited lanes in a warp which execute this block |
| /// are guaranteed to execute it together |
| pub uniform: bool, |
| |
| pub instrs: Vec<Instr>, |
| } |
| |
| impl BasicBlock { |
| pub fn map_instrs(&mut self, mut map: impl FnMut(Instr) -> MappedInstrs) { |
| let mut instrs = Vec::new(); |
| for i in self.instrs.drain(..) { |
| match map(i) { |
| MappedInstrs::None => (), |
| MappedInstrs::One(i) => { |
| instrs.push(i); |
| } |
| MappedInstrs::Many(mut v) => { |
| instrs.append(&mut v); |
| } |
| } |
| } |
| self.instrs = instrs; |
| } |
| |
| pub fn phi_dsts_ip(&self) -> Option<usize> { |
| for (ip, instr) in self.instrs.iter().enumerate() { |
| match &instr.op { |
| Op::Annotate(_) => (), |
| Op::PhiDsts(_) => return Some(ip), |
| _ => break, |
| } |
| } |
| None |
| } |
| |
| pub fn phi_dsts(&self) -> Option<&OpPhiDsts> { |
| self.phi_dsts_ip().map(|ip| match &self.instrs[ip].op { |
| Op::PhiDsts(phi) => phi.deref(), |
| _ => panic!("Expected to find the phi"), |
| }) |
| } |
| |
| #[allow(dead_code)] |
| pub fn phi_dsts_mut(&mut self) -> Option<&mut OpPhiDsts> { |
| self.phi_dsts_ip().map(|ip| match &mut self.instrs[ip].op { |
| Op::PhiDsts(phi) => phi.deref_mut(), |
| _ => panic!("Expected to find the phi"), |
| }) |
| } |
| |
| pub fn phi_srcs_ip(&self) -> Option<usize> { |
| for (ip, instr) in self.instrs.iter().enumerate().rev() { |
| match &instr.op { |
| Op::Annotate(_) => (), |
| Op::PhiSrcs(_) => return Some(ip), |
| _ if instr.is_branch() => (), |
| _ => break, |
| } |
| } |
| None |
| } |
| pub fn phi_srcs(&self) -> Option<&OpPhiSrcs> { |
| self.phi_srcs_ip().map(|ip| match &self.instrs[ip].op { |
| Op::PhiSrcs(phi) => phi.deref(), |
| _ => panic!("Expected to find the phi"), |
| }) |
| } |
| |
| pub fn phi_srcs_mut(&mut self) -> Option<&mut OpPhiSrcs> { |
| self.phi_srcs_ip().map(|ip| match &mut self.instrs[ip].op { |
| Op::PhiSrcs(phi) => phi.deref_mut(), |
| _ => panic!("Expected to find the phi"), |
| }) |
| } |
| |
| pub fn branch(&self) -> Option<&Instr> { |
| if let Some(i) = self.instrs.last() { |
| if i.is_branch() { |
| Some(i) |
| } else { |
| None |
| } |
| } else { |
| None |
| } |
| } |
| |
| pub fn branch_ip(&self) -> Option<usize> { |
| if let Some(i) = self.instrs.last() { |
| if i.is_branch() { |
| Some(self.instrs.len() - 1) |
| } else { |
| None |
| } |
| } else { |
| None |
| } |
| } |
| |
| #[allow(dead_code)] |
| pub fn branch_mut(&mut self) -> Option<&mut Instr> { |
| if let Some(i) = self.instrs.last_mut() { |
| if i.is_branch() { |
| Some(i) |
| } else { |
| None |
| } |
| } else { |
| None |
| } |
| } |
| |
| pub fn falls_through(&self) -> bool { |
| if let Some(i) = self.branch() { |
| !i.is_branch_always_taken() |
| } else { |
| true |
| } |
| } |
| } |
| |
| pub struct Function { |
| pub ssa_alloc: SSAValueAllocator, |
| pub phi_alloc: PhiAllocator, |
| pub blocks: CFG<BasicBlock>, |
| } |
| |
| impl Function { |
| pub fn map_instrs( |
| &mut self, |
| mut map: impl FnMut(Instr, &mut SSAValueAllocator) -> MappedInstrs, |
| ) { |
| let alloc = &mut self.ssa_alloc; |
| for b in &mut self.blocks { |
| b.map_instrs(|i| map(i, alloc)); |
| } |
| } |
| } |
| |
| impl fmt::Display for Function { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let mut pred_width = 0; |
| let mut dsts_width = 0; |
| let mut op_width = 0; |
| |
| let mut blocks = Vec::new(); |
| for b in &self.blocks { |
| let mut instrs = Vec::new(); |
| for i in &b.instrs { |
| let mut pred = String::new(); |
| write!(pred, "{}", Fmt(|f| i.fmt_pred(f)))?; |
| let mut dsts = String::new(); |
| write!(dsts, "{}", Fmt(|f| i.op.fmt_dsts(f)))?; |
| let mut op = String::new(); |
| write!(op, "{}", Fmt(|f| i.op.fmt_op(f)))?; |
| let mut deps = String::new(); |
| write!(deps, "{}", i.deps)?; |
| |
| pred_width = max(pred_width, pred.len()); |
| dsts_width = max(dsts_width, dsts.len()); |
| op_width = max(op_width, op.len()); |
| let is_annotation = matches!(i.op, Op::Annotate(_)); |
| |
| instrs.push((pred, dsts, op, deps, is_annotation)); |
| } |
| blocks.push(instrs); |
| } |
| |
| for (i, mut b) in blocks.drain(..).enumerate() { |
| let u = if self.blocks[i].uniform { ".u" } else { "" }; |
| write!(f, "block{u} {} {} [", i, self.blocks[i].label)?; |
| for (pi, p) in self.blocks.pred_indices(i).iter().enumerate() { |
| if pi > 0 { |
| write!(f, ", ")?; |
| } |
| write!(f, "{}", p)?; |
| } |
| write!(f, "] -> {{\n")?; |
| |
| for (pred, dsts, op, deps, is_annotation) in b.drain(..) { |
| let eq_sym = if dsts.is_empty() { " " } else { "=" }; |
| if is_annotation { |
| write!(f, "\n{}\n", op)?; |
| } else if deps.is_empty() { |
| write!( |
| f, |
| "{:<pred_width$} {:<dsts_width$} {} {}\n", |
| pred, dsts, eq_sym, op, |
| )?; |
| } else { |
| write!( |
| f, |
| "{:<pred_width$} {:<dsts_width$} {} \ |
| {:<op_width$} //{}\n", |
| pred, dsts, eq_sym, op, deps, |
| )?; |
| } |
| } |
| |
| write!(f, "}} -> [")?; |
| for (si, s) in self.blocks.succ_indices(i).iter().enumerate() { |
| if si > 0 { |
| write!(f, ", ")?; |
| } |
| write!(f, "{}", s)?; |
| } |
| write!(f, "]\n")?; |
| } |
| Ok(()) |
| } |
| } |
| |
| #[derive(Debug)] |
| pub struct ComputeShaderInfo { |
| pub local_size: [u16; 3], |
| pub smem_size: u16, |
| } |
| |
| #[derive(Debug)] |
| pub struct FragmentShaderInfo { |
| pub uses_kill: bool, |
| pub does_interlock: bool, |
| pub post_depth_coverage: bool, |
| pub early_fragment_tests: bool, |
| pub uses_sample_shading: bool, |
| } |
| |
| #[derive(Debug)] |
| pub struct GeometryShaderInfo { |
| pub passthrough_enable: bool, |
| pub stream_out_mask: u8, |
| pub threads_per_input_primitive: u8, |
| pub output_topology: OutputTopology, |
| pub max_output_vertex_count: u16, |
| } |
| |
| impl Default for GeometryShaderInfo { |
| fn default() -> Self { |
| Self { |
| passthrough_enable: false, |
| stream_out_mask: 0, |
| threads_per_input_primitive: 0, |
| output_topology: OutputTopology::LineStrip, |
| max_output_vertex_count: 0, |
| } |
| } |
| } |
| |
| #[derive(Debug)] |
| pub struct TessellationInitShaderInfo { |
| pub per_patch_attribute_count: u8, |
| pub threads_per_patch: u8, |
| } |
| |
| #[repr(u8)] |
| #[derive(Clone, Copy, Debug)] |
| pub enum TessellationDomain { |
| Isoline = NAK_TS_DOMAIN_ISOLINE, |
| Triangle = NAK_TS_DOMAIN_TRIANGLE, |
| Quad = NAK_TS_DOMAIN_QUAD, |
| } |
| |
| #[repr(u8)] |
| #[derive(Clone, Copy, Debug)] |
| pub enum TessellationSpacing { |
| Integer = NAK_TS_SPACING_INTEGER, |
| FractionalOdd = NAK_TS_SPACING_FRACT_ODD, |
| FractionalEven = NAK_TS_SPACING_FRACT_EVEN, |
| } |
| |
| #[repr(u8)] |
| #[derive(Clone, Copy, Debug)] |
| pub enum TessellationPrimitives { |
| Points = NAK_TS_PRIMS_POINTS, |
| Lines = NAK_TS_PRIMS_LINES, |
| TrianglesCW = NAK_TS_PRIMS_TRIANGLES_CW, |
| TrianglesCCW = NAK_TS_PRIMS_TRIANGLES_CCW, |
| } |
| |
| #[derive(Debug)] |
| pub struct TessellationShaderInfo { |
| pub domain: TessellationDomain, |
| pub spacing: TessellationSpacing, |
| pub primitives: TessellationPrimitives, |
| } |
| |
| #[derive(Debug)] |
| pub enum ShaderStageInfo { |
| Compute(ComputeShaderInfo), |
| Vertex, |
| Fragment(FragmentShaderInfo), |
| Geometry(GeometryShaderInfo), |
| TessellationInit(TessellationInitShaderInfo), |
| Tessellation(TessellationShaderInfo), |
| } |
| |
| #[derive(Debug, Default)] |
| pub struct SysValInfo { |
| pub ab: u32, |
| pub c: u16, |
| } |
| |
| #[derive(Debug)] |
| pub struct VtgIoInfo { |
| pub sysvals_in: SysValInfo, |
| pub sysvals_in_d: u8, |
| pub sysvals_out: SysValInfo, |
| pub sysvals_out_d: u8, |
| pub attr_in: [u32; 4], |
| pub attr_out: [u32; 4], |
| pub store_req_start: u8, |
| pub store_req_end: u8, |
| pub clip_enable: u8, |
| pub cull_enable: u8, |
| pub xfb: Option<Box<nak_xfb_info>>, |
| } |
| |
| impl VtgIoInfo { |
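    // The attribute address ranges decoded below:
    //   0x000..0x080  system values, tracked in sysvals.ab (one bit per dword)
    //   0x080..0x280  generic attributes, tracked in attr (one bit per dword)
    //   0x280..0x2c0  fixed-function color I/O (unsupported)
    //   0x2c0..0x300  system values, tracked in sysvals.c
    //   0x3a0..0x3c0  system values, tracked in sysvals_d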
| fn mark_attrs(&mut self, addrs: Range<u16>, written: bool) { |
| let sysvals = if written { |
| &mut self.sysvals_out |
| } else { |
| &mut self.sysvals_in |
| }; |
| |
| let sysvals_d = if written { |
| &mut self.sysvals_out_d |
| } else { |
| &mut self.sysvals_in_d |
| }; |
| |
| let mut attr = BitMutView::new(if written { |
| &mut self.attr_out |
| } else { |
| &mut self.attr_in |
| }); |
| |
| let mut addrs = addrs; |
| addrs.start &= !3; |
| for addr in addrs.step_by(4) { |
| if addr < 0x080 { |
| sysvals.ab |= 1 << (addr / 4); |
| } else if addr < 0x280 { |
| let attr_idx = (addr - 0x080) as usize / 4; |
| attr.set_bit(attr_idx, true); |
| } else if addr < 0x2c0 { |
| panic!("FF color I/O not supported"); |
| } else if addr < 0x300 { |
| sysvals.c |= 1 << ((addr - 0x2c0) / 4); |
| } else if addr >= 0x3a0 && addr < 0x3c0 { |
| *sysvals_d |= 1 << ((addr - 0x3a0) / 4); |
| } |
| } |
| } |
| |
| pub fn mark_attrs_read(&mut self, addrs: Range<u16>) { |
| self.mark_attrs(addrs, false); |
| } |
| |
| pub fn mark_attrs_written(&mut self, addrs: Range<u16>) { |
| self.mark_attrs(addrs, true); |
| } |
| |
| pub fn attr_written(&self, addr: u16) -> bool { |
| if addr < 0x080 { |
| self.sysvals_out.ab & (1 << (addr / 4)) != 0 |
| } else if addr < 0x280 { |
| let attr_idx = (addr - 0x080) as usize / 4; |
| BitView::new(&self.attr_out).get_bit(attr_idx) |
| } else if addr < 0x2c0 { |
| panic!("FF color I/O not supported"); |
| } else if addr < 0x300 { |
| self.sysvals_out.c & (1 << ((addr - 0x2c0) / 4)) != 0 |
| } else if addr >= 0x3a0 && addr < 0x3c0 { |
| self.sysvals_out_d & (1 << ((addr - 0x3a0) / 4)) != 0 |
| } else { |
| panic!("Unknown I/O address"); |
| } |
| } |
| |
| pub fn mark_store_req(&mut self, addrs: Range<u16>) { |
| let start = (addrs.start / 4).try_into().unwrap(); |
| let end = ((addrs.end - 1) / 4).try_into().unwrap(); |
| self.store_req_start = min(self.store_req_start, start); |
| self.store_req_end = max(self.store_req_end, end); |
| } |
| } |
| |
| #[derive(Debug)] |
| pub struct FragmentIoInfo { |
| pub sysvals_in: SysValInfo, |
| pub sysvals_in_d: [PixelImap; 8], |
| pub attr_in: [PixelImap; 128], |
| pub barycentric_attr_in: [u32; 4], |
| |
| pub reads_sample_mask: bool, |
| pub writes_color: u32, |
| pub writes_sample_mask: bool, |
| pub writes_depth: bool, |
| } |
| |
| impl FragmentIoInfo { |
| pub fn mark_attr_read(&mut self, addr: u16, interp: PixelImap) { |
| if addr < 0x080 { |
| self.sysvals_in.ab |= 1 << (addr / 4); |
| } else if addr < 0x280 { |
| let attr_idx = (addr - 0x080) as usize / 4; |
| self.attr_in[attr_idx] = interp; |
| } else if addr < 0x2c0 { |
| panic!("FF color I/O not supported"); |
| } else if addr < 0x300 { |
| self.sysvals_in.c |= 1 << ((addr - 0x2c0) / 4); |
| } else if addr >= 0x3a0 && addr < 0x3c0 { |
| let attr_idx = (addr - 0x3a0) as usize / 4; |
| self.sysvals_in_d[attr_idx] = interp; |
| } |
| } |
| |
| pub fn mark_barycentric_attr_in(&mut self, addr: u16) { |
| assert!(addr >= 0x80 && addr < 0x280); |
| |
| let mut attr = BitMutView::new(&mut self.barycentric_attr_in); |
| |
| let attr_idx = (addr - 0x080) as usize / 4; |
| attr.set_bit(attr_idx, true); |
| } |
| } |
| |
| #[derive(Debug)] |
| pub enum ShaderIoInfo { |
| None, |
| Vtg(VtgIoInfo), |
| Fragment(FragmentIoInfo), |
| } |
| |
| #[derive(Debug)] |
| pub struct ShaderInfo { |
| pub max_warps_per_sm: u32, |
| pub num_gprs: u8, |
| pub num_control_barriers: u8, |
| pub num_instrs: u32, |
| pub num_static_cycles: u64, |
| pub num_spills_to_mem: u32, |
| pub num_fills_from_mem: u32, |
| pub num_spills_to_reg: u32, |
| pub num_fills_from_reg: u32, |
| pub slm_size: u32, |
| pub max_crs_depth: u32, |
| pub uses_global_mem: bool, |
| pub writes_global_mem: bool, |
| pub uses_fp64: bool, |
| pub stage: ShaderStageInfo, |
| pub io: ShaderIoInfo, |
| } |
| |
| pub trait ShaderModel { |
| fn sm(&self) -> u8; |
| |
| #[allow(dead_code)] |
| fn is_fermi(&self) -> bool { |
| self.sm() >= 20 && self.sm() < 30 |
| } |
| |
| #[allow(dead_code)] |
| fn is_kepler_a(&self) -> bool { |
| self.sm() >= 30 && self.sm() < 32 |
| } |
| |
| #[allow(dead_code)] |
| fn is_kepler_b(&self) -> bool { |
| // TK1 is SM 3.2 and desktop Kepler B is SM 3.3+ |
| self.sm() >= 32 && self.sm() < 40 |
| } |
| |
| #[allow(dead_code)] |
| fn is_kepler(&self) -> bool { |
| self.is_kepler_a() || self.is_kepler_b() |
| } |
| |
| // The following helpers are pulled from GetSpaVersion in the open-source |
| // NVIDIA kernel driver sources |
| |
| #[allow(dead_code)] |
| fn is_maxwell(&self) -> bool { |
| self.sm() >= 50 && self.sm() < 60 |
| } |
| |
| #[allow(dead_code)] |
| fn is_pascal(&self) -> bool { |
| self.sm() >= 60 && self.sm() < 70 |
| } |
| |
| #[allow(dead_code)] |
| fn is_volta(&self) -> bool { |
| self.sm() >= 70 && self.sm() < 73 |
| } |
| |
| #[allow(dead_code)] |
| fn is_turing(&self) -> bool { |
| self.sm() >= 73 && self.sm() < 80 |
| } |
| |
| #[allow(dead_code)] |
| fn is_ampere(&self) -> bool { |
| self.sm() >= 80 && self.sm() < 89 |
| } |
| |
| #[allow(dead_code)] |
| fn is_ada(&self) -> bool { |
| self.sm() == 89 |
| } |
| |
| #[allow(dead_code)] |
| fn is_hopper(&self) -> bool { |
| self.sm() >= 90 && self.sm() < 100 |
| } |
| |
| #[allow(dead_code)] |
| fn is_blackwell_a(&self) -> bool { |
| self.sm() >= 100 && self.sm() < 110 |
| } |
| |
| #[allow(dead_code)] |
| fn is_blackwell_b(&self) -> bool { |
| self.sm() >= 120 && self.sm() < 130 |
| } |
| |
| #[allow(dead_code)] |
| fn is_blackwell(&self) -> bool { |
| self.is_blackwell_a() || self.is_blackwell_b() |
| } |
| |
| fn num_regs(&self, file: RegFile) -> u32; |
| fn hw_reserved_gprs(&self) -> u32; |
| fn crs_size(&self, max_crs_depth: u32) -> u32; |
| |
| fn op_can_be_uniform(&self, op: &Op) -> bool; |
| |
| // Scheduling information |
| fn op_needs_scoreboard(&self, op: &Op) -> bool { |
| !op.no_scoreboard() && !op.has_fixed_latency(self.sm()) |
| } |
| |
| /// Latency before another non-NOP can execute |
| fn exec_latency(&self, op: &Op) -> u32; |
| |
    /// Read-after-write latency
| fn raw_latency( |
| &self, |
| write: &Op, |
| dst_idx: usize, |
| read: &Op, |
| src_idx: usize, |
| ) -> u32; |
| |
| /// Write-after-read latency |
| fn war_latency( |
| &self, |
| read: &Op, |
| src_idx: usize, |
| write: &Op, |
| dst_idx: usize, |
| ) -> u32; |
| |
| /// Write-after-write latency |
| fn waw_latency( |
| &self, |
| a: &Op, |
| a_dst_idx: usize, |
| a_has_pred: bool, |
| b: &Op, |
| b_dst_idx: usize, |
| ) -> u32; |
| |
| /// Predicate read-after-write latency |
| fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32; |
| |
| /// Worst-case access-after-write latency |
| fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32; |
| |
| /// Maximum encodable instruction delay |
| fn max_instr_delay(&self) -> u8; |
| |
| fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op); |
| fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32>; |
| } |
| |
| /// For compute shaders, large values of local_size impose an additional limit |
| /// on the number of GPRs per thread |
| pub fn gpr_limit_from_local_size(local_size: &[u16; 3]) -> u32 { |
| fn prev_multiple_of(x: u32, y: u32) -> u32 { |
| (x / y) * y |
| } |
| |
| let local_size = local_size[0] * local_size[1] * local_size[2]; |
| // Warps are allocated in multiples of 4 |
| // Multiply that by 32 threads/warp |
| let local_size = local_size.next_multiple_of(4 * 32) as u32; |
| let total_regs: u32 = 65536; |
| |
| let out = total_regs / local_size; |
| // GPRs are allocated in multiples of 8 |
| let out = prev_multiple_of(out, 8); |
| min(out, 255) |
| } |
| |
| pub fn max_warps_per_sm(gprs: u32) -> u32 { |
| fn prev_multiple_of(x: u32, y: u32) -> u32 { |
| (x / y) * y |
| } |
| |
| // TODO: Take local_size and shared mem limit into account for compute |
| let total_regs: u32 = 65536; |
| // GPRs are allocated in multiples of 8 |
| let gprs = gprs.next_multiple_of(8); |
| let max_warps = prev_multiple_of((total_regs / 32) / gprs, 4); |
| min(max_warps, 48) |
| } |
| |
| pub struct Shader<'a> { |
| pub sm: &'a dyn ShaderModel, |
| pub info: ShaderInfo, |
| pub functions: Vec<Function>, |
| } |
| |
| impl Shader<'_> { |
| pub fn for_each_instr(&self, f: &mut impl FnMut(&Instr)) { |
| for func in &self.functions { |
| for b in &func.blocks { |
| for i in &b.instrs { |
| f(i); |
| } |
| } |
| } |
| } |
| |
| pub fn map_instrs( |
| &mut self, |
| mut map: impl FnMut(Instr, &mut SSAValueAllocator) -> MappedInstrs, |
| ) { |
| for f in &mut self.functions { |
| f.map_instrs(&mut map); |
| } |
| } |
| |
| /// Remove all annotations, presumably before encoding the shader. |
| pub fn remove_annotations(&mut self) { |
| self.map_instrs(|instr: Instr, _| -> MappedInstrs { |
| if matches!(instr.op, Op::Annotate(_)) { |
| MappedInstrs::None |
| } else { |
| MappedInstrs::One(instr) |
| } |
| }) |
| } |
| |
| pub fn gather_info(&mut self) { |
| let mut num_instrs = 0; |
| let mut uses_global_mem = false; |
| let mut writes_global_mem = false; |
| let mut uses_fp64 = false; |
| |
| self.for_each_instr(&mut |instr| { |
| num_instrs += 1; |
| |
| if !uses_global_mem { |
| uses_global_mem = instr.uses_global_mem(); |
| } |
| |
| if !writes_global_mem { |
| writes_global_mem = instr.writes_global_mem(); |
| } |
| |
| if !uses_fp64 { |
| uses_fp64 = instr.op.is_fp64(); |
| } |
| }); |
| |
| self.info.num_instrs = num_instrs; |
| self.info.uses_global_mem = uses_global_mem; |
| self.info.writes_global_mem = writes_global_mem; |
| self.info.uses_fp64 = uses_fp64; |
| |
| self.info.max_warps_per_sm = max_warps_per_sm( |
| self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(), |
| ); |
| } |
| } |
| |
| impl fmt::Display for Shader<'_> { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| for func in &self.functions { |
| write!(f, "{}", func)?; |
| } |
| Ok(()) |
| } |
| } |