| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s |
| ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s |
| ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck -check-prefix=O0 %s |
| |
| @buf = dso_local global [3072 x i8] zeroinitializer, align 64 |
| |
; @foo: internal helper whose body is a single `ret void`. It exists only as a
; call target for @test_api below. All three run configurations expect it to
; compile to a lone `retq`.
| define internal void @foo() { |
| ; CHECK-LABEL: foo: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: retq |
| ; |
| ; IPRA-LABEL: foo: |
| ; IPRA: # %bb.0: # %entry |
| ; IPRA-NEXT: retq |
| ; |
| ; O0-LABEL: foo: |
| ; O0: # %bb.0: # %entry |
| ; O0-NEXT: retq |
| entry: |
| ret void |
| } |
| |
; @test_api: loads two AMX tiles from @buf, calls @foo while both tiles are
; still live, then extracts a <16 x i32> row from each tile with tilemovrow
; (immediate 2) and returns the element-wise sum of the two rows.
;
; The expected assembly below differs per run configuration in how the live
; tiles are carried across the call to @foo:
;   - default run      - both tiles are spilled (tilestored, Folded Spill)
;                        before `callq foo`, ldtilecfg is issued again right
;                        after the call, and the tiles are reloaded
;                        (tileloadd, Folded Reload).
;   - -enable-ipra run - the tiles stay in %tmm0/%tmm1 across the call; no
;                        spill, no reload and no second ldtilecfg is expected.
;   - -O0 run          - each tile is stored to a stack slot right after it is
;                        loaded, and an ldtilecfg is expected before each of
;                        the four tileloadd instructions.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
| define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { |
| ; CHECK-LABEL: test_api: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: pushq %rbp |
| ; CHECK-NEXT: pushq %r14 |
| ; CHECK-NEXT: pushq %rbx |
| ; CHECK-NEXT: subq $2112, %rsp # imm = 0x840 |
| ; CHECK-NEXT: movl %esi, %ebx |
| ; CHECK-NEXT: movl %edi, %ebp |
| ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; CHECK-NEXT: vmovups %zmm0, (%rsp) |
| ; CHECK-NEXT: movb $1, (%rsp) |
| ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: ldtilecfg (%rsp) |
| ; CHECK-NEXT: movl $buf, %eax |
| ; CHECK-NEXT: movl $32, %ecx |
| ; CHECK-NEXT: movw $8, %r14w |
| ; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0 |
| ; CHECK-NEXT: movabsq $64, %rax |
| ; CHECK-NEXT: tilestored %tmm0, 1088(%rsp,%rax) # 1024-byte Folded Spill |
| ; CHECK-NEXT: movl $buf+1024, %eax |
| ; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1 |
| ; CHECK-NEXT: movabsq $64, %rax |
| ; CHECK-NEXT: tilestored %tmm1, 64(%rsp,%rax) # 1024-byte Folded Spill |
| ; CHECK-NEXT: vzeroupper |
| ; CHECK-NEXT: callq foo |
| ; CHECK-NEXT: ldtilecfg (%rsp) |
| ; CHECK-NEXT: movabsq $64, %rax |
| ; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm1 # 1024-byte Folded Reload |
| ; CHECK-NEXT: tilemovrow $2, %tmm1, %zmm0 |
| ; CHECK-NEXT: tileloadd 1088(%rsp,%rax), %tmm0 # 1024-byte Folded Reload |
| ; CHECK-NEXT: tilemovrow $2, %tmm0, %zmm1 |
| ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 |
| ; CHECK-NEXT: addq $2112, %rsp # imm = 0x840 |
| ; CHECK-NEXT: popq %rbx |
| ; CHECK-NEXT: popq %r14 |
| ; CHECK-NEXT: popq %rbp |
| ; CHECK-NEXT: tilerelease |
| ; CHECK-NEXT: retq |
| ; |
| ; IPRA-LABEL: test_api: |
| ; IPRA: # %bb.0: |
| ; IPRA-NEXT: subq $72, %rsp |
| ; IPRA-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; IPRA-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) |
| ; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp) |
| ; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp) |
| ; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp) |
| ; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) |
| ; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) |
| ; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp) |
| ; IPRA-NEXT: movl $buf, %eax |
| ; IPRA-NEXT: movl $32, %ecx |
| ; IPRA-NEXT: movw $8, %dx |
| ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0 |
| ; IPRA-NEXT: movl $buf+1024, %eax |
| ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1 |
| ; IPRA-NEXT: callq foo |
| ; IPRA-NEXT: tilemovrow $2, %tmm1, %zmm0 |
| ; IPRA-NEXT: tilemovrow $2, %tmm0, %zmm1 |
| ; IPRA-NEXT: vpaddd %zmm1, %zmm0, %zmm0 |
| ; IPRA-NEXT: addq $72, %rsp |
| ; IPRA-NEXT: tilerelease |
| ; IPRA-NEXT: retq |
| ; |
| ; O0-LABEL: test_api: |
| ; O0: # %bb.0: |
| ; O0-NEXT: pushq %rbp |
| ; O0-NEXT: movq %rsp, %rbp |
| ; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 |
| ; O0-NEXT: subq $4096, %rsp # imm = 0x1000 |
| ; O0-NEXT: vpxor %xmm0, %xmm0, %xmm0 |
| ; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: movb $1, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: movw %si, %cx |
| ; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; O0-NEXT: movw %di, %ax |
| ; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; O0-NEXT: movl $buf, %esi |
| ; O0-NEXT: movl $32, %edi |
| ; O0-NEXT: movw $8, %dx |
| ; O0-NEXT: # implicit-def: $al |
| ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) |
| ; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0 |
| ; O0-NEXT: movl $64, %edi |
| ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; O0-NEXT: movw $8, %dx |
| ; O0-NEXT: tilestored %tmm0, (%rsi,%rdi) |
| ; O0-NEXT: movl $32, %esi |
| ; O0-NEXT: movl $buf+1024, %edx |
| ; O0-NEXT: movw $8, %ax |
| ; O0-NEXT: # implicit-def: $al |
| ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) |
| ; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 |
| ; O0-NEXT: movl $64, %esi |
| ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx |
| ; O0-NEXT: movw $8, %ax |
| ; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) |
| ; O0-NEXT: vzeroupper |
| ; O0-NEXT: callq foo |
| ; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload |
| ; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload |
| ; O0-NEXT: movl $64, %edi |
| ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; O0-NEXT: movw $8, %cx |
| ; O0-NEXT: # implicit-def: $cl |
| ; O0-NEXT: movb %cl, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) |
| ; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0 |
| ; O0-NEXT: movw $8, %cx |
| ; O0-NEXT: tilemovrow $2, %tmm0, %zmm0 |
| ; O0-NEXT: movl $64, %esi |
| ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx |
| ; O0-NEXT: movw $8, %cx |
| ; O0-NEXT: # implicit-def: $al |
| ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) |
| ; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 |
| ; O0-NEXT: movw $8, %cx |
| ; O0-NEXT: tilemovrow $2, %tmm0, %zmm1 |
| ; O0-NEXT: vpaddd %zmm1, %zmm0, %zmm0 |
| ; O0-NEXT: movq %rbp, %rsp |
| ; O0-NEXT: popq %rbp |
| ; O0-NEXT: tilerelease |
| ; O0-NEXT: retq |
; First tile, with shape operands (%0, 8), loaded from the start of @buf.
; The trailing i64 32 shows up as the index register of the tileloadd in the
; expected assembly. (The two leading i16 operands are presumably the row
; count and the column byte count - confirm against the AMX intrinsic
; definitions.)
| %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) |
; Second tile, with shape operands (8, %1), loaded from byte offset 1024 of
; @buf with the same trailing i64 32.
| %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) |
; Both %3 and %4 are used after this call, so both tiles are live across it.
| call void @foo() |
; Each tilemovrow passes the same two shape operands that were used when its
; tile was loaded; the second tile (%4) is read first.
| %5 = call <16 x i32> @llvm.x86.tilemovrow.internal(i16 8, i16 %1, x86_amx %4, i32 2) |
| %6 = call <16 x i32> @llvm.x86.tilemovrow.internal(i16 %0, i16 8, x86_amx %3, i32 2) |
| %7 = add <16 x i32> %5, %6 |
| ret <16 x i32> %7 |
| } |
| |
| |
; Declarations of the AMX intrinsics. @test_api calls only tileloadd64.internal
; and tilemovrow.internal; tdpbssd.internal and tilestored64.internal are
; declared but not called in the IR shown in this file.
| declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) |
| declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) |
| declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) |
| declare <16 x i32> @llvm.x86.tilemovrow.internal(i16, i16, x86_amx, i32) |