llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll - third_party/github.com/llvm/llvm-project - Git at Google

 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; Interleave loads and stores to fit into 9 VGPR limit.
 ; This requires to avoid load/store clustering.

 ; GCN: global_load_dwordx4
 ; GCN: global_store_dwordx4
 ; GCN: global_load_dwordx4
 ; GCN: global_store_dwordx4
 ; GCN: global_load_dwordx4
 ; GCN: global_store_dwordx4
 ; GCN: NumVgprs: {{[0-9]$}}
 ; GCN: ScratchSize: 0{{$}}

 define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
   %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
   %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
   %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
   %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
   %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
   %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
   store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
   %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
   store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
   store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
   ret void
 }

 declare i32 @llvm.amdgcn.workitem.id.x() #0

 attributes #0 = { nounwind readnone }
 attributes #1 = { "amdgpu-num-vgpr"="9" }
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

	; Interleave loads and stores to fit into 9 VGPR limit.
	; This requires to avoid load/store clustering.

	; GCN: global_load_dwordx4
	; GCN: global_store_dwordx4
	; GCN: global_load_dwordx4
	; GCN: global_store_dwordx4
	; GCN: global_load_dwordx4
	; GCN: global_store_dwordx4
	; GCN: NumVgprs: {{[0-9]$}}
	; GCN: ScratchSize: 0{{$}}

	define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
	bb:
	%id = call i32 @llvm.amdgcn.workitem.id.x()
	%base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
	%tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
	%tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
	%tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
	%tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
	%tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
	%tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
	store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
	%tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
	store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
	%tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
	store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
	ret void
	}

	declare i32 @llvm.amdgcn.workitem.id.x() #0

	attributes #0 = { nounwind readnone }
	attributes #1 = { "amdgpu-num-vgpr"="9" }