; blob: 71f8f231747fe74e981477f59e1e1a04632da3e6 [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck -check-prefix=O0 %s
; 3072-byte, zero-initialized, 64-byte-aligned buffer. @test_api loads its two
; tiles from byte offsets 0 and 1024 of this buffer.
@buf = dso_local global [3072 x i8] zeroinitializer, align 64
; Empty internal callee: its only instruction is `ret void`. It is the call
; target inside @test_api, and all three check prefixes below expect it to be
; lowered to a bare `retq`.
define internal void @foo() {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
;
; IPRA-LABEL: foo:
; IPRA: # %bb.0: # %entry
; IPRA-NEXT: retq
;
; O0-LABEL: foo:
; O0: # %bb.0: # %entry
; O0-NEXT: retq
entry:
ret void
}
; Keeps two AMX tile values live across a call to @foo.
;
; Two tiles are loaded from @buf (byte offsets 0 and 1024, both with stride
; operand 32), @foo is called, and afterwards tilemovrow with immediate 2 is
; applied to each tile; the two <16 x i32> results are added and returned.
;
; What the three check prefixes below pin down:
;  * default run (CHECK prefix): both tiles are written to the stack with
;    tilestored before the call and read back with tileloadd after it, and
;    ldtilecfg is issued again right after the call.
;  * -enable-ipra run (IPRA prefix): no spill, no reload and no second
;    ldtilecfg; %tmm0 and %tmm1 are used directly after the call.
;  * -O0 run (O0 prefix): each tile is stored to a stack slot right after it
;    is loaded, and is loaded back (preceded by an ldtilecfg) before each
;    tilemovrow.
define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test_api:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $2112, %rsp # imm = 0x840
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %zmm0, (%rsp)
; CHECK-NEXT: movb $1, (%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg (%rsp)
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %ecx
; CHECK-NEXT: movw $8, %r14w
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm0, 1088(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm1, 64(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg (%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT: tilemovrow $2, %tmm1, %zmm0
; CHECK-NEXT: tileloadd 1088(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
; CHECK-NEXT: tilemovrow $2, %tmm0, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: addq $2112, %rsp # imm = 0x840
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: retq
;
; IPRA-LABEL: test_api:
; IPRA: # %bb.0:
; IPRA-NEXT: subq $72, %rsp
; IPRA-NEXT: vxorps %xmm0, %xmm0, %xmm0
; IPRA-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT: movl $buf, %eax
; IPRA-NEXT: movl $32, %ecx
; IPRA-NEXT: movw $8, %dx
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0
; IPRA-NEXT: movl $buf+1024, %eax
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1
; IPRA-NEXT: callq foo
; IPRA-NEXT: tilemovrow $2, %tmm1, %zmm0
; IPRA-NEXT: tilemovrow $2, %tmm0, %zmm1
; IPRA-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; IPRA-NEXT: addq $72, %rsp
; IPRA-NEXT: tilerelease
; IPRA-NEXT: retq
;
; O0-LABEL: test_api:
; O0: # %bb.0:
; O0-NEXT: pushq %rbp
; O0-NEXT: movq %rsp, %rbp
; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
; O0-NEXT: subq $4096, %rsp # imm = 0x1000
; O0-NEXT: vpxor %xmm0, %xmm0, %xmm0
; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
; O0-NEXT: movw %si, %cx
; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; O0-NEXT: movw %di, %ax
; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; O0-NEXT: movl $buf, %esi
; O0-NEXT: movl $32, %edi
; O0-NEXT: movw $8, %dx
; O0-NEXT: # implicit-def: $al
; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0
; O0-NEXT: movl $64, %edi
; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; O0-NEXT: movw $8, %dx
; O0-NEXT: tilestored %tmm0, (%rsi,%rdi)
; O0-NEXT: movl $32, %esi
; O0-NEXT: movl $buf+1024, %edx
; O0-NEXT: movw $8, %ax
; O0-NEXT: # implicit-def: $al
; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
; O0-NEXT: movl $64, %esi
; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
; O0-NEXT: movw $8, %ax
; O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
; O0-NEXT: vzeroupper
; O0-NEXT: callq foo
; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; O0-NEXT: movl $64, %edi
; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; O0-NEXT: movw $8, %cx
; O0-NEXT: # implicit-def: $cl
; O0-NEXT: movb %cl, {{[0-9]+}}(%rsp)
; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0
; O0-NEXT: movw $8, %cx
; O0-NEXT: tilemovrow $2, %tmm0, %zmm0
; O0-NEXT: movl $64, %esi
; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
; O0-NEXT: movw $8, %cx
; O0-NEXT: # implicit-def: $al
; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
; O0-NEXT: movw $8, %cx
; O0-NEXT: tilemovrow $2, %tmm0, %zmm1
; O0-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; O0-NEXT: movq %rbp, %rsp
; O0-NEXT: popq %rbp
; O0-NEXT: tilerelease
; O0-NEXT: retq
; Both tile values are defined before the call and only used after it.
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
call void @foo()
; Each tilemovrow repeats the two i16 shape operands of the load that produced
; its tile.
%5 = call <16 x i32> @llvm.x86.tilemovrow.internal(i16 8, i16 %1, x86_amx %4, i32 2)
%6 = call <16 x i32> @llvm.x86.tilemovrow.internal(i16 %0, i16 8, x86_amx %3, i32 2)
%7 = add <16 x i32> %5, %6
ret <16 x i32> %7
}
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
declare <16 x i32> @llvm.x86.tilemovrow.internal(i16, i16, x86_amx, i32)