; blob: 358f42dfe8dd5ad5f423299d5db92d567d18effc [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s
; %struct.S wraps a single array of 32 i32 elements.
%struct.S = type { [32 x i32] }
; @shared is one %struct.S placed in address space 3, with an undef initializer
; and 4-byte alignment.
; NOTE(review): no use of @shared is visible in this part of the file -
; presumably it is referenced by tests further down; confirm.
@shared = addrspace(3) global %struct.S undef, align 4
; Kernel that copies a constant 47 bytes between two default-address-space
; (generic/flat) pointers with a non-volatile llvm.memcpy.p0.p0.i64 call.
; The assertions below pin a fully inlined, byte-by-byte expansion: one
; flat_load_ubyte / flat_store_byte pair per byte at offsets 0 through 46, each
; pair separated by a full s_waitcnt, and no call instruction.
; NOTE(review): attribute group #0 is defined outside the visible part of the
; file; the function name suggests it carries minsize - confirm.
define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 {
; CHECK-LABEL: memcpy_p0_p0_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45
; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
ret void
}
; Kernel that copies a constant 47 bytes between two address-space-1 pointers
; with a non-volatile llvm.memcpy.p1.p1.i64 call.
; The assertions below pin an inlined expansion using wide global accesses:
; dwordx2 copies at offsets 32 and 39 and dwordx4 copies at offsets 0 and 16.
; The offset-39 copy overlaps the offset-32 copy by one byte, so the odd
; 47-byte length is covered without any narrower access.
; NOTE(review): attribute group #0 is defined outside the visible part of the
; file; the function name suggests it carries minsize - confirm.
define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 {
; CHECK-LABEL: memcpy_p1_p1_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
ret void
}
; Kernel that copies a constant 128 bytes from an address-space-4 pointer to an
; address-space-1 pointer with a non-volatile llvm.memcpy.p1.p4.i64 call.
; The assertions below pin an inlined expansion of eight
; global_load_dwordx4 / global_store_dwordx4 pairs at offsets 0, 16, ..., 112,
; each load followed by s_waitcnt vmcnt(0) before its store.
; NOTE(review): attribute group #0 is defined outside the visible part of the
; file; the function name suggests it carries minsize - confirm.
define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p1_p4_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
ret void
}
; Kernel that copies a constant 128 bytes from an address-space-4 pointer to an
; address-space-5 pointer with a non-volatile llvm.memcpy.p5.p4.i64 call.
; The assertions below pin an inlined, byte-by-byte expansion: global_load_ubyte
; from the source and buffer_store_byte (offen) to the destination at offsets 0
; through 127. Many loads are kept in flight at once across v2..v21, and each
; store is preceded by an s_waitcnt vmcnt(N) whose count is pinned exactly.
; NOTE(review): the prologue copies s[0:3] into s[8:11] and adds s7 to s8/s9;
; this looks like scratch resource descriptor setup - confirm against the
; AMDGPU kernel ABI.
; NOTE(review): attribute group #0 is defined outside the visible part of the
; file; the function name suggests it carries minsize - confirm.
define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p5_p4_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1]
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18
; CHECK-NEXT: s_waitcnt vmcnt(18)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19
; CHECK-NEXT: s_waitcnt vmcnt(19)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21
; CHECK-NEXT: s_waitcnt vmcnt(21)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22
; CHECK-NEXT: s_waitcnt vmcnt(22)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(23)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24
; CHECK-NEXT: s_waitcnt vmcnt(24)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
; CHECK-NEXT: s_waitcnt vmcnt(25)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27
; CHECK-NEXT: s_waitcnt vmcnt(27)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28
; CHECK-NEXT: s_waitcnt vmcnt(28)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29
; CHECK-NEXT: s_waitcnt vmcnt(29)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30
; CHECK-NEXT: s_waitcnt vmcnt(30)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31
; CHECK-NEXT: s_waitcnt vmcnt(31)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(32)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33
; CHECK-NEXT: s_waitcnt vmcnt(33)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34
; CHECK-NEXT: s_waitcnt vmcnt(34)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35
; CHECK-NEXT: s_waitcnt vmcnt(35)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_waitcnt vmcnt(35)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107
; CHECK-NEXT: s_waitcnt vmcnt(35)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_waitcnt vmcnt(35)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_waitcnt vmcnt(34)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_waitcnt vmcnt(33)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_waitcnt vmcnt(32)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_waitcnt vmcnt(31)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_waitcnt vmcnt(30)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_waitcnt vmcnt(29)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_waitcnt vmcnt(28)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
; CHECK-NEXT: s_waitcnt vmcnt(27)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
; CHECK-NEXT: s_waitcnt vmcnt(25)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
; CHECK-NEXT: s_waitcnt vmcnt(25)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
; CHECK-NEXT: s_waitcnt vmcnt(24)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
; CHECK-NEXT: s_waitcnt vmcnt(23)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
; CHECK-NEXT: s_waitcnt vmcnt(22)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
; CHECK-NEXT: s_waitcnt vmcnt(21)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_waitcnt vmcnt(19)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
; CHECK-NEXT: s_waitcnt vmcnt(19)
; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
ret void
}
; Test: 128-byte llvm.memcpy from an addrspace(5) pointer argument to a generic
; (addrspace 0) pointer argument, with the volatile flag set to false.
; The expected gfx908 output below performs the copy one byte at a time:
; 128 buffer_load_ubyte instructions paired with 128 flat_store_byte
; instructions, covering offsets 0 through 127.
; NOTE(review): attribute group #0 is defined outside this chunk; judging by
; the function name it is presumably minsize -- confirm.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
; CHECK-LABEL: memcpy_p0_p5_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: s_waitcnt vmcnt(17)
; CHECK-NEXT: flat_store_byte v[0:1], v3
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126
; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
ret void
}
; Test: 128-byte llvm.memcpy from an addrspace(4) pointer argument into the
; addrspace(3) global @shared, with the volatile flag set to false.
; The expected gfx908 output below uses wide accesses: eight
; global_load_dwordx4 loads (source offsets 0 through 112) whose results are
; written out with eight ds_write2_b64 instructions.
; NOTE(review): attribute group #0 is defined outside this chunk; judging by
; the function name it is presumably minsize -- confirm.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p3_p4_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(5)
; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(5)
; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
ret void
}
; Test: 128-byte llvm.memcpy from the addrspace(3) global @shared to a generic
; (addrspace 0) pointer argument, with the volatile flag set to false.
; The expected gfx908 output below copies one byte at a time: every
; ds_read_u8 result is written out with a flat_store_byte at the same offset,
; covering offsets 0 through 127.
; NOTE(review): attribute group #0 is defined outside this chunk; judging by
; the function name it is presumably minsize -- confirm.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-LABEL: memcpy_p0_p3_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: ds_read_u8 v3, v2 offset:127
; CHECK-NEXT: ds_read_u8 v4, v2 offset:126
; CHECK-NEXT: ds_read_u8 v5, v2 offset:125
; CHECK-NEXT: ds_read_u8 v6, v2 offset:124
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126
; CHECK-NEXT: ds_read_u8 v3, v2 offset:123
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124
; CHECK-NEXT: ds_read_u8 v4, v2 offset:122
; CHECK-NEXT: ds_read_u8 v5, v2 offset:121
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123
; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
; CHECK-NEXT: ds_read_u8 v6, v2 offset:119
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121
; CHECK-NEXT: ds_read_u8 v4, v2 offset:118
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119
; CHECK-NEXT: ds_read_u8 v3, v2 offset:117
; CHECK-NEXT: ds_read_u8 v5, v2 offset:116
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118
; CHECK-NEXT: ds_read_u8 v4, v2 offset:115
; CHECK-NEXT: ds_read_u8 v6, v2 offset:114
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116
; CHECK-NEXT: ds_read_u8 v3, v2 offset:113
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
; CHECK-NEXT: ds_read_u8 v4, v2 offset:112
; CHECK-NEXT: ds_read_u8 v5, v2 offset:111
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
; CHECK-NEXT: ds_read_u8 v3, v2 offset:110
; CHECK-NEXT: ds_read_u8 v6, v2 offset:109
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111
; CHECK-NEXT: ds_read_u8 v4, v2 offset:108
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109
; CHECK-NEXT: ds_read_u8 v3, v2 offset:107
; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108
; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
; CHECK-NEXT: ds_read_u8 v6, v2 offset:104
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
; CHECK-NEXT: ds_read_u8 v3, v2 offset:103
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104
; CHECK-NEXT: ds_read_u8 v4, v2 offset:102
; CHECK-NEXT: ds_read_u8 v5, v2 offset:101
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103
; CHECK-NEXT: ds_read_u8 v3, v2 offset:100
; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101
; CHECK-NEXT: ds_read_u8 v4, v2 offset:98
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
; CHECK-NEXT: ds_read_u8 v3, v2 offset:97
; CHECK-NEXT: ds_read_u8 v5, v2 offset:96
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98
; CHECK-NEXT: ds_read_u8 v4, v2 offset:95
; CHECK-NEXT: ds_read_u8 v6, v2 offset:94
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96
; CHECK-NEXT: ds_read_u8 v3, v2 offset:93
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94
; CHECK-NEXT: ds_read_u8 v4, v2 offset:92
; CHECK-NEXT: ds_read_u8 v5, v2 offset:91
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93
; CHECK-NEXT: ds_read_u8 v3, v2 offset:90
; CHECK-NEXT: ds_read_u8 v6, v2 offset:89
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91
; CHECK-NEXT: ds_read_u8 v4, v2 offset:88
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89
; CHECK-NEXT: ds_read_u8 v3, v2 offset:87
; CHECK-NEXT: ds_read_u8 v5, v2 offset:86
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88
; CHECK-NEXT: ds_read_u8 v4, v2 offset:85
; CHECK-NEXT: ds_read_u8 v6, v2 offset:84
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86
; CHECK-NEXT: ds_read_u8 v3, v2 offset:83
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
; CHECK-NEXT: ds_read_u8 v4, v2 offset:82
; CHECK-NEXT: ds_read_u8 v5, v2 offset:81
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83
; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
; CHECK-NEXT: ds_read_u8 v6, v2 offset:79
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81
; CHECK-NEXT: ds_read_u8 v4, v2 offset:78
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79
; CHECK-NEXT: ds_read_u8 v3, v2 offset:77
; CHECK-NEXT: ds_read_u8 v5, v2 offset:76
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78
; CHECK-NEXT: ds_read_u8 v4, v2 offset:75
; CHECK-NEXT: ds_read_u8 v6, v2 offset:74
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76
; CHECK-NEXT: ds_read_u8 v3, v2 offset:73
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74
; CHECK-NEXT: ds_read_u8 v4, v2 offset:72
; CHECK-NEXT: ds_read_u8 v5, v2 offset:71
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73
; CHECK-NEXT: ds_read_u8 v3, v2 offset:70
; CHECK-NEXT: ds_read_u8 v6, v2 offset:69
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71
; CHECK-NEXT: ds_read_u8 v4, v2 offset:68
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69
; CHECK-NEXT: ds_read_u8 v3, v2 offset:67
; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
; CHECK-NEXT: ds_read_u8 v6, v2 offset:64
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
; CHECK-NEXT: ds_read_u8 v3, v2 offset:63
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64
; CHECK-NEXT: ds_read_u8 v4, v2 offset:62
; CHECK-NEXT: ds_read_u8 v5, v2 offset:61
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63
; CHECK-NEXT: ds_read_u8 v3, v2 offset:60
; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61
; CHECK-NEXT: ds_read_u8 v4, v2 offset:58
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
; CHECK-NEXT: ds_read_u8 v3, v2 offset:57
; CHECK-NEXT: ds_read_u8 v5, v2 offset:56
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
; CHECK-NEXT: ds_read_u8 v4, v2 offset:55
; CHECK-NEXT: ds_read_u8 v6, v2 offset:54
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
; CHECK-NEXT: ds_read_u8 v3, v2 offset:53
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
; CHECK-NEXT: ds_read_u8 v4, v2 offset:52
; CHECK-NEXT: ds_read_u8 v5, v2 offset:51
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53
; CHECK-NEXT: ds_read_u8 v3, v2 offset:50
; CHECK-NEXT: ds_read_u8 v6, v2 offset:49
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51
; CHECK-NEXT: ds_read_u8 v4, v2 offset:48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49
; CHECK-NEXT: ds_read_u8 v3, v2 offset:47
; CHECK-NEXT: ds_read_u8 v5, v2 offset:46
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48
; CHECK-NEXT: ds_read_u8 v4, v2 offset:45
; CHECK-NEXT: ds_read_u8 v6, v2 offset:44
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46
; CHECK-NEXT: ds_read_u8 v3, v2 offset:43
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44
; CHECK-NEXT: ds_read_u8 v4, v2 offset:42
; CHECK-NEXT: ds_read_u8 v5, v2 offset:41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
; CHECK-NEXT: ds_read_u8 v6, v2 offset:39
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
; CHECK-NEXT: ds_read_u8 v4, v2 offset:38
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
; CHECK-NEXT: ds_read_u8 v3, v2 offset:37
; CHECK-NEXT: ds_read_u8 v5, v2 offset:36
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38
; CHECK-NEXT: ds_read_u8 v4, v2 offset:35
; CHECK-NEXT: ds_read_u8 v6, v2 offset:34
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36
; CHECK-NEXT: ds_read_u8 v3, v2 offset:33
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34
; CHECK-NEXT: ds_read_u8 v4, v2 offset:32
; CHECK-NEXT: ds_read_u8 v5, v2 offset:31
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33
; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
; CHECK-NEXT: ds_read_u8 v6, v2 offset:29
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31
; CHECK-NEXT: ds_read_u8 v4, v2 offset:28
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
; CHECK-NEXT: ds_read_u8 v3, v2 offset:27
; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
; CHECK-NEXT: ds_read_u8 v6, v2 offset:24
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
; CHECK-NEXT: ds_read_u8 v3, v2 offset:23
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24
; CHECK-NEXT: ds_read_u8 v4, v2 offset:22
; CHECK-NEXT: ds_read_u8 v5, v2 offset:21
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23
; CHECK-NEXT: ds_read_u8 v3, v2 offset:20
; CHECK-NEXT: ds_read_u8 v6, v2 offset:19
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21
; CHECK-NEXT: ds_read_u8 v4, v2 offset:18
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19
; CHECK-NEXT: ds_read_u8 v3, v2 offset:16
; CHECK-NEXT: ds_read_u8 v5, v2 offset:17
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18
; CHECK-NEXT: ds_read_u8 v4, v2 offset:8
; CHECK-NEXT: ds_read_u8 v6, v2 offset:9
; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
; CHECK-NEXT: ds_read_u8 v8, v2 offset:11
; CHECK-NEXT: ds_read_u8 v9, v2 offset:12
; CHECK-NEXT: ds_read_u8 v10, v2 offset:13
; CHECK-NEXT: ds_read_u8 v11, v2 offset:14
; CHECK-NEXT: ds_read_u8 v12, v2 offset:15
; CHECK-NEXT: ds_read_u8 v13, v2
; CHECK-NEXT: ds_read_u8 v14, v2 offset:1
; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
; CHECK-NEXT: ds_read_u8 v16, v2 offset:3
; CHECK-NEXT: ds_read_u8 v17, v2 offset:4
; CHECK-NEXT: ds_read_u8 v18, v2 offset:5
; CHECK-NEXT: ds_read_u8 v19, v2 offset:6
; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1
; CHECK-NEXT: flat_store_byte v[0:1], v13
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
ret void
}
; Kernel test: a 47-byte, non-volatile (i1 false) llvm.memcpy from %src to
; %dest, both plain `ptr` (address space 0) arguments, compiled under
; attribute group #1.
; NOTE(review): #1 is defined outside this section; the function name suggests
; it carries optsize -- confirm against the attribute list in this file.
; The call passes no align attributes on either pointer. The autogenerated
; checks below expect the copy to be expanded inline as 47 single-byte
; flat_load_ubyte / flat_store_byte pairs covering offsets 0 through 46.
; Do not hand-edit the CHECK lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-LABEL: memcpy_p0_p0_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45
; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
ret void
}
; Kernel test: a 47-byte, non-volatile (i1 false) llvm.memcpy from %src to
; %dest, both addrspace(1) pointers, compiled under attribute group #1.
; NOTE(review): #1 is defined outside this section; the function name suggests
; it carries optsize -- confirm against the attribute list in this file.
; The autogenerated checks below expect four wide load/store pairs:
; dwordx4 at offsets 0 and 16, and dwordx2 at offsets 32 and 39. The 8-byte
; access at offset 39 overlaps the one at offset 32 by a single byte so that
; exactly bytes 0..46 are written.
; Do not hand-edit the CHECK lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 {
; CHECK-LABEL: memcpy_p1_p1_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
ret void
}
; Kernel test: a 128-byte, non-volatile (i1 false) llvm.memcpy from the
; addrspace(4) pointer %0 to the addrspace(1) pointer %global, compiled under
; attribute group #1.
; NOTE(review): #1 is defined outside this section; the function name suggests
; it carries optsize -- confirm against the attribute list in this file.
; The autogenerated checks below expect eight global_load_dwordx4 /
; global_store_dwordx4 pairs at offsets 0, 16, ..., 112 (8 x 16 = 128 bytes).
; Do not hand-edit the CHECK lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p1_p4_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
ret void
}
; Kernel test: a 128-byte, non-volatile (i1 false) llvm.memcpy from the
; addrspace(4) pointer %0 to the addrspace(5) pointer %local, compiled under
; attribute group #1.
; NOTE(review): #1 is defined outside this section; the function name suggests
; it carries optsize -- confirm against the attribute list in this file.
; The call passes no align attributes on either pointer. The autogenerated
; checks below expect a fully inlined byte-wise copy: one global_load_ubyte
; and one buffer_store_byte (offen) for every offset 0 through 127, with the
; loads and stores interleaved and separated by s_waitcnt vmcnt(N)
; instructions. The exact instruction order and wait counts are part of what
; is being checked.
; Do not hand-edit the CHECK lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p5_p4_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1]
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18
; CHECK-NEXT: s_waitcnt vmcnt(18)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19
; CHECK-NEXT: s_waitcnt vmcnt(19)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21
; CHECK-NEXT: s_waitcnt vmcnt(21)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22
; CHECK-NEXT: s_waitcnt vmcnt(22)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(23)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24
; CHECK-NEXT: s_waitcnt vmcnt(24)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
; CHECK-NEXT: s_waitcnt vmcnt(25)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27
; CHECK-NEXT: s_waitcnt vmcnt(27)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28
; CHECK-NEXT: s_waitcnt vmcnt(28)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29
; CHECK-NEXT: s_waitcnt vmcnt(29)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30
; CHECK-NEXT: s_waitcnt vmcnt(30)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31
; CHECK-NEXT: s_waitcnt vmcnt(31)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(32)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33
; CHECK-NEXT: s_waitcnt vmcnt(33)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34
; CHECK-NEXT: s_waitcnt vmcnt(34)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35
; CHECK-NEXT: s_waitcnt vmcnt(35)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_waitcnt vmcnt(35)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107
; CHECK-NEXT: s_waitcnt vmcnt(35)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_waitcnt vmcnt(35)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_waitcnt vmcnt(34)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_waitcnt vmcnt(33)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_waitcnt vmcnt(32)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_waitcnt vmcnt(31)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_waitcnt vmcnt(30)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_waitcnt vmcnt(29)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_waitcnt vmcnt(28)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
; CHECK-NEXT: s_waitcnt vmcnt(27)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
; CHECK-NEXT: s_waitcnt vmcnt(25)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125
; CHECK-NEXT: s_waitcnt vmcnt(36)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
; CHECK-NEXT: s_waitcnt vmcnt(25)
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
; CHECK-NEXT: s_waitcnt vmcnt(24)
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
; CHECK-NEXT: s_waitcnt vmcnt(23)
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
; CHECK-NEXT: s_waitcnt vmcnt(22)
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
; CHECK-NEXT: s_waitcnt vmcnt(21)
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_waitcnt vmcnt(19)
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
; CHECK-NEXT: s_waitcnt vmcnt(19)
; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
ret void
}
; Test: fixed-size, non-volatile 128-byte llvm.memcpy from an addrspace(5)
; pointer (%src) to a generic/flat pointer (%generic), in a kernel carrying
; attribute group #1 (presumably optsize, going by the function name; the
; attribute group definition is outside this part of the file).
; The autogenerated assertions below expect the copy to be expanded inline as
; byte-wise buffer_load_ubyte / flat_store_byte pairs covering offsets 0..127,
; with no call and no loop. Do not edit them by hand; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
; CHECK-LABEL: memcpy_p0_p5_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: s_waitcnt vmcnt(17)
; CHECK-NEXT: flat_store_byte v[0:1], v3
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126
; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127
; CHECK-NEXT: s_endpgm
entry:
; 128-byte copy; the trailing `i1 false` marks the memcpy as non-volatile.
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
ret void
}
; Test: fixed-size, non-volatile 128-byte llvm.memcpy from an addrspace(4)
; kernel argument (%0) into the addrspace(3) global @shared, in a kernel
; carrying attribute group #1 (presumably optsize, going by the function name;
; the attribute group definition is outside this part of the file).
; The autogenerated assertions below expect wide accesses: eight
; global_load_dwordx4 loads (16 bytes each) paired with eight ds_write2_b64
; stores. Do not edit them by hand; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p3_p4_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(5)
; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(5)
; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
; CHECK-NEXT: s_endpgm
entry:
; 128-byte copy into @shared; the trailing `i1 false` marks the memcpy as non-volatile.
tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
ret void
}
; Test: fixed-size, non-volatile 128-byte llvm.memcpy from the addrspace(3)
; global @shared to a generic/flat pointer (%generic), in a kernel carrying
; attribute group #1 (presumably optsize, going by the function name; the
; attribute group definition is outside this part of the file).
; The autogenerated assertions below expect the copy to be expanded inline as
; byte-wise ds_read_u8 / flat_store_byte pairs covering offsets 0..127, emitted
; mostly from the highest offset downwards. Do not edit them by hand;
; regenerate them with utils/update_llc_test_checks.py.
define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-LABEL: memcpy_p0_p3_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: ds_read_u8 v3, v2 offset:127
; CHECK-NEXT: ds_read_u8 v4, v2 offset:126
; CHECK-NEXT: ds_read_u8 v5, v2 offset:125
; CHECK-NEXT: ds_read_u8 v6, v2 offset:124
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126
; CHECK-NEXT: ds_read_u8 v3, v2 offset:123
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124
; CHECK-NEXT: ds_read_u8 v4, v2 offset:122
; CHECK-NEXT: ds_read_u8 v5, v2 offset:121
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123
; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
; CHECK-NEXT: ds_read_u8 v6, v2 offset:119
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121
; CHECK-NEXT: ds_read_u8 v4, v2 offset:118
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119
; CHECK-NEXT: ds_read_u8 v3, v2 offset:117
; CHECK-NEXT: ds_read_u8 v5, v2 offset:116
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118
; CHECK-NEXT: ds_read_u8 v4, v2 offset:115
; CHECK-NEXT: ds_read_u8 v6, v2 offset:114
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116
; CHECK-NEXT: ds_read_u8 v3, v2 offset:113
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
; CHECK-NEXT: ds_read_u8 v4, v2 offset:112
; CHECK-NEXT: ds_read_u8 v5, v2 offset:111
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
; CHECK-NEXT: ds_read_u8 v3, v2 offset:110
; CHECK-NEXT: ds_read_u8 v6, v2 offset:109
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111
; CHECK-NEXT: ds_read_u8 v4, v2 offset:108
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109
; CHECK-NEXT: ds_read_u8 v3, v2 offset:107
; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108
; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
; CHECK-NEXT: ds_read_u8 v6, v2 offset:104
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
; CHECK-NEXT: ds_read_u8 v3, v2 offset:103
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104
; CHECK-NEXT: ds_read_u8 v4, v2 offset:102
; CHECK-NEXT: ds_read_u8 v5, v2 offset:101
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103
; CHECK-NEXT: ds_read_u8 v3, v2 offset:100
; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101
; CHECK-NEXT: ds_read_u8 v4, v2 offset:98
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
; CHECK-NEXT: ds_read_u8 v3, v2 offset:97
; CHECK-NEXT: ds_read_u8 v5, v2 offset:96
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98
; CHECK-NEXT: ds_read_u8 v4, v2 offset:95
; CHECK-NEXT: ds_read_u8 v6, v2 offset:94
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96
; CHECK-NEXT: ds_read_u8 v3, v2 offset:93
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94
; CHECK-NEXT: ds_read_u8 v4, v2 offset:92
; CHECK-NEXT: ds_read_u8 v5, v2 offset:91
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93
; CHECK-NEXT: ds_read_u8 v3, v2 offset:90
; CHECK-NEXT: ds_read_u8 v6, v2 offset:89
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91
; CHECK-NEXT: ds_read_u8 v4, v2 offset:88
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89
; CHECK-NEXT: ds_read_u8 v3, v2 offset:87
; CHECK-NEXT: ds_read_u8 v5, v2 offset:86
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88
; CHECK-NEXT: ds_read_u8 v4, v2 offset:85
; CHECK-NEXT: ds_read_u8 v6, v2 offset:84
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86
; CHECK-NEXT: ds_read_u8 v3, v2 offset:83
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
; CHECK-NEXT: ds_read_u8 v4, v2 offset:82
; CHECK-NEXT: ds_read_u8 v5, v2 offset:81
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83
; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
; CHECK-NEXT: ds_read_u8 v6, v2 offset:79
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81
; CHECK-NEXT: ds_read_u8 v4, v2 offset:78
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79
; CHECK-NEXT: ds_read_u8 v3, v2 offset:77
; CHECK-NEXT: ds_read_u8 v5, v2 offset:76
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78
; CHECK-NEXT: ds_read_u8 v4, v2 offset:75
; CHECK-NEXT: ds_read_u8 v6, v2 offset:74
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76
; CHECK-NEXT: ds_read_u8 v3, v2 offset:73
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74
; CHECK-NEXT: ds_read_u8 v4, v2 offset:72
; CHECK-NEXT: ds_read_u8 v5, v2 offset:71
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73
; CHECK-NEXT: ds_read_u8 v3, v2 offset:70
; CHECK-NEXT: ds_read_u8 v6, v2 offset:69
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71
; CHECK-NEXT: ds_read_u8 v4, v2 offset:68
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69
; CHECK-NEXT: ds_read_u8 v3, v2 offset:67
; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
; CHECK-NEXT: ds_read_u8 v6, v2 offset:64
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
; CHECK-NEXT: ds_read_u8 v3, v2 offset:63
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64
; CHECK-NEXT: ds_read_u8 v4, v2 offset:62
; CHECK-NEXT: ds_read_u8 v5, v2 offset:61
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63
; CHECK-NEXT: ds_read_u8 v3, v2 offset:60
; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61
; CHECK-NEXT: ds_read_u8 v4, v2 offset:58
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
; CHECK-NEXT: ds_read_u8 v3, v2 offset:57
; CHECK-NEXT: ds_read_u8 v5, v2 offset:56
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
; CHECK-NEXT: ds_read_u8 v4, v2 offset:55
; CHECK-NEXT: ds_read_u8 v6, v2 offset:54
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
; CHECK-NEXT: ds_read_u8 v3, v2 offset:53
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
; CHECK-NEXT: ds_read_u8 v4, v2 offset:52
; CHECK-NEXT: ds_read_u8 v5, v2 offset:51
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53
; CHECK-NEXT: ds_read_u8 v3, v2 offset:50
; CHECK-NEXT: ds_read_u8 v6, v2 offset:49
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51
; CHECK-NEXT: ds_read_u8 v4, v2 offset:48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49
; CHECK-NEXT: ds_read_u8 v3, v2 offset:47
; CHECK-NEXT: ds_read_u8 v5, v2 offset:46
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48
; CHECK-NEXT: ds_read_u8 v4, v2 offset:45
; CHECK-NEXT: ds_read_u8 v6, v2 offset:44
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46
; CHECK-NEXT: ds_read_u8 v3, v2 offset:43
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44
; CHECK-NEXT: ds_read_u8 v4, v2 offset:42
; CHECK-NEXT: ds_read_u8 v5, v2 offset:41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
; CHECK-NEXT: ds_read_u8 v6, v2 offset:39
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
; CHECK-NEXT: ds_read_u8 v4, v2 offset:38
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
; CHECK-NEXT: ds_read_u8 v3, v2 offset:37
; CHECK-NEXT: ds_read_u8 v5, v2 offset:36
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38
; CHECK-NEXT: ds_read_u8 v4, v2 offset:35
; CHECK-NEXT: ds_read_u8 v6, v2 offset:34
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36
; CHECK-NEXT: ds_read_u8 v3, v2 offset:33
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34
; CHECK-NEXT: ds_read_u8 v4, v2 offset:32
; CHECK-NEXT: ds_read_u8 v5, v2 offset:31
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33
; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
; CHECK-NEXT: ds_read_u8 v6, v2 offset:29
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31
; CHECK-NEXT: ds_read_u8 v4, v2 offset:28
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
; CHECK-NEXT: ds_read_u8 v3, v2 offset:27
; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
; CHECK-NEXT: ds_read_u8 v6, v2 offset:24
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
; CHECK-NEXT: ds_read_u8 v3, v2 offset:23
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24
; CHECK-NEXT: ds_read_u8 v4, v2 offset:22
; CHECK-NEXT: ds_read_u8 v5, v2 offset:21
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23
; CHECK-NEXT: ds_read_u8 v3, v2 offset:20
; CHECK-NEXT: ds_read_u8 v6, v2 offset:19
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21
; CHECK-NEXT: ds_read_u8 v4, v2 offset:18
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19
; CHECK-NEXT: ds_read_u8 v3, v2 offset:16
; CHECK-NEXT: ds_read_u8 v5, v2 offset:17
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18
; CHECK-NEXT: ds_read_u8 v4, v2 offset:8
; CHECK-NEXT: ds_read_u8 v6, v2 offset:9
; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
; CHECK-NEXT: ds_read_u8 v8, v2 offset:11
; CHECK-NEXT: ds_read_u8 v9, v2 offset:12
; CHECK-NEXT: ds_read_u8 v10, v2 offset:13
; CHECK-NEXT: ds_read_u8 v11, v2 offset:14
; CHECK-NEXT: ds_read_u8 v12, v2 offset:15
; CHECK-NEXT: ds_read_u8 v13, v2
; CHECK-NEXT: ds_read_u8 v14, v2 offset:1
; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
; CHECK-NEXT: ds_read_u8 v16, v2 offset:3
; CHECK-NEXT: ds_read_u8 v17, v2 offset:4
; CHECK-NEXT: ds_read_u8 v18, v2 offset:5
; CHECK-NEXT: ds_read_u8 v19, v2 offset:6
; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1
; CHECK-NEXT: flat_store_byte v[0:1], v13
; CHECK-NEXT: s_endpgm
entry:
; 128-byte copy out of @shared; the trailing `i1 false` marks the memcpy as non-volatile.
tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
ret void
}
; Declarations of the llvm.memcpy intrinsic overloads used by this test.
; The mangled suffix .p<D>.p<S>.i64 mirrors the operand types of each overload:
; a destination pointer in address space D, a source pointer in address space S,
; and an i64 byte count. The trailing `i1 immarg` operand is the intrinsic's
; volatile flag (see the LLVM LangRef entry for llvm.memcpy).
; NOTE(review): per the AMDGPU backend documentation the address spaces are
; 0 = flat/generic, 1 = global, 3 = local (LDS), 4 = constant, 5 = private
; (scratch) -- confirm against AMDGPUUsage if this matters for a change.
; Every declaration carries attribute group #2.
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
; Attribute groups referenced by the definitions and declarations in this file.
; #0 (minsize) is attached to the *_minsize kernels, e.g. @memcpy_p0_p0_minsize.
attributes #0 = { minsize }
; #1 (optsize): presumably attached to the optsize kernel variants -- their
; definitions are outside the part of the file reviewed here; verify.
attributes #1 = { optsize }
; #2 is the attribute set applied to the llvm.memcpy intrinsic declarations.
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }