| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s |
| |
; Aggregate of 32 x i32 and an addrspace(3) global of that type, 4-byte aligned.
; NOTE(review): neither is referenced by the functions visible in this part of
; the file; presumably used by tests further down - confirm before removing.
| %struct.S = type { [32 x i32] } |
| |
| @shared = addrspace(3) global %struct.S undef, align 4 |
| |
; Non-volatile 47-byte llvm.memcpy between two default-address-space (ptr)
; kernel arguments. The expected gfx908 output is a fully unrolled byte copy:
; one flat_load_ubyte / flat_store_byte pair per byte at offsets 0..46, each
; store preceded by an s_waitcnt for the load feeding it.
; Attribute group #0 is defined outside this part of the file (presumably
; minsize, going by the function name - confirm).
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
| define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 { |
| ; CHECK-LABEL: memcpy_p0_p0_minsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s2 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s3 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] |
| ; CHECK-NEXT: v_mov_b32_e32 v3, s1 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45 |
| ; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) |
| ret void |
| } |
| |
; Non-volatile 47-byte llvm.memcpy between two addrspace(1) kernel arguments.
; The expected gfx908 output uses wide accesses: dwordx4 load/store pairs at
; offsets 0 and 16, plus dwordx2 pairs at offsets 32 and 39. The two dwordx2
; accesses overlap on byte 39, so together the four pairs cover bytes 0..46.
; Attribute group #0 is defined outside this part of the file (presumably
; minsize, going by the function name - confirm).
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
| define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 { |
| ; CHECK-LABEL: memcpy_p1_p1_minsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, 0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32 |
| ; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false) |
| ret void |
| } |
| |
; Non-volatile 128-byte llvm.memcpy from an addrspace(4) source to an
; addrspace(1) destination. The expected gfx908 output is eight
; global_load_dwordx4 / global_store_dwordx4 pairs at offsets 0, 16, ..., 112,
; each store waiting (vmcnt(0)) on the load that feeds it.
; Attribute group #0 is defined outside this part of the file (presumably
; minsize, going by the function name - confirm).
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
| define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 { |
| ; CHECK-LABEL: memcpy_p1_p4_minsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, 0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false) |
| ret void |
| } |
| |
; Non-volatile 128-byte llvm.memcpy from an addrspace(4) source to an
; addrspace(5) destination. The expected gfx908 output is a fully unrolled
; byte copy: 128 global_load_ubyte instructions (offsets 0..127) paired with
; 128 buffer_store_byte instructions through the s[8:11] descriptor with
; "offen" addressing. Loads are issued well ahead of the stores that consume
; them, so the s_waitcnt vmcnt(N) values use non-zero counts rather than
; draining to zero before every store.
; Attribute group #0 is defined outside this part of the file (presumably
; minsize, going by the function name - confirm).
; The CHECK lines are autogenerated and order-sensitive; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
| define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 { |
| ; CHECK-LABEL: memcpy_p5_p4_minsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 |
| ; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: s_add_u32 s8, s8, s7 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15 |
| ; CHECK-NEXT: s_addc_u32 s9, s9, 0 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s2 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18 |
| ; CHECK-NEXT: s_waitcnt vmcnt(18) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19 |
| ; CHECK-NEXT: s_waitcnt vmcnt(19) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20 |
| ; CHECK-NEXT: s_waitcnt vmcnt(20) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21 |
| ; CHECK-NEXT: s_waitcnt vmcnt(21) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22 |
| ; CHECK-NEXT: s_waitcnt vmcnt(22) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23 |
| ; CHECK-NEXT: s_waitcnt vmcnt(23) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24 |
| ; CHECK-NEXT: s_waitcnt vmcnt(24) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25 |
| ; CHECK-NEXT: s_waitcnt vmcnt(25) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27 |
| ; CHECK-NEXT: s_waitcnt vmcnt(27) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28 |
| ; CHECK-NEXT: s_waitcnt vmcnt(28) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29 |
| ; CHECK-NEXT: s_waitcnt vmcnt(29) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30 |
| ; CHECK-NEXT: s_waitcnt vmcnt(30) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31 |
| ; CHECK-NEXT: s_waitcnt vmcnt(31) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(32) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33 |
| ; CHECK-NEXT: s_waitcnt vmcnt(33) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34 |
| ; CHECK-NEXT: s_waitcnt vmcnt(34) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35 |
| ; CHECK-NEXT: s_waitcnt vmcnt(35) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70 |
| ; CHECK-NEXT: s_waitcnt vmcnt(35) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107 |
| ; CHECK-NEXT: s_waitcnt vmcnt(35) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89 |
| ; CHECK-NEXT: s_waitcnt vmcnt(35) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90 |
| ; CHECK-NEXT: s_waitcnt vmcnt(34) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91 |
| ; CHECK-NEXT: s_waitcnt vmcnt(33) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92 |
| ; CHECK-NEXT: s_waitcnt vmcnt(32) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93 |
| ; CHECK-NEXT: s_waitcnt vmcnt(31) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94 |
| ; CHECK-NEXT: s_waitcnt vmcnt(30) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95 |
| ; CHECK-NEXT: s_waitcnt vmcnt(29) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96 |
| ; CHECK-NEXT: s_waitcnt vmcnt(28) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97 |
| ; CHECK-NEXT: s_waitcnt vmcnt(27) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99 |
| ; CHECK-NEXT: s_waitcnt vmcnt(25) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119 |
| ; CHECK-NEXT: s_waitcnt vmcnt(25) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120 |
| ; CHECK-NEXT: s_waitcnt vmcnt(24) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121 |
| ; CHECK-NEXT: s_waitcnt vmcnt(23) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122 |
| ; CHECK-NEXT: s_waitcnt vmcnt(22) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123 |
| ; CHECK-NEXT: s_waitcnt vmcnt(21) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124 |
| ; CHECK-NEXT: s_waitcnt vmcnt(20) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125 |
| ; CHECK-NEXT: s_waitcnt vmcnt(19) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126 |
| ; CHECK-NEXT: s_waitcnt vmcnt(19) |
| ; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) |
| ret void |
| } |
| |
| ; Test: 128-byte non-volatile memcpy from an addrspace(5) pointer (%src) to a |
| ; generic (flat) pointer (%generic) in a kernel carrying attribute group #0. |
| ; NOTE(review): #0 is defined outside this excerpt; presumably minsize, going |
| ; by the function name -- confirm. |
| ; The expected output is a fully inlined copy: one buffer_load_ubyte and one |
| ; flat_store_byte for each byte offset 0..127, with no call and no loop. |
| ; The CHECK lines below are autogenerated (see the NOTE at the top of the file); |
| ; regenerate them with utils/update_llc_test_checks.py instead of hand-editing. |
| define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 { |
| ; CHECK-LABEL: memcpy_p0_p5_minsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] |
| ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 |
| ; CHECK-NEXT: s_add_u32 s8, s8, s7 |
| ; CHECK-NEXT: s_addc_u32 s9, s9, 0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13 |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s0 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s1 |
| ; CHECK-NEXT: s_waitcnt vmcnt(17) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) |
| ret void |
| } |
| |
| ; Test: 128-byte non-volatile memcpy from an addrspace(4) pointer (%0) into the |
| ; addrspace(3) global @shared (a struct of 32 x i32, i.e. 128 bytes). |
| ; NOTE(review): attribute group #0 is defined outside this excerpt; presumably |
| ; minsize, going by the function name -- confirm. |
| ; The expected output uses wide accesses: eight global_load_dwordx4 loads |
| ; (byte offsets 112 down to 0) feeding eight ds_write2_b64 stores, with no call |
| ; and no loop. |
| ; The CHECK lines below are autogenerated (see the NOTE at the top of the file); |
| ; regenerate them with utils/update_llc_test_checks.py instead of hand-editing. |
| define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 { |
| ; CHECK-LABEL: memcpy_p3_p4_minsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v24, 0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 |
| ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 |
| ; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 |
| ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 |
| ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 |
| ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(5) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15 |
| ; CHECK-NEXT: s_waitcnt vmcnt(4) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 |
| ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] |
| ; CHECK-NEXT: s_waitcnt vmcnt(5) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11 |
| ; CHECK-NEXT: s_waitcnt vmcnt(4) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(3) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(2) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5 |
| ; CHECK-NEXT: s_waitcnt vmcnt(1) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false) |
| ret void |
| } |
| |
| define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { |
| ; CHECK-LABEL: memcpy_p0_p3_minsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, 0 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:127 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:126 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:125 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:124 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s0 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s1 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:123 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:122 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:121 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:119 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:118 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:117 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:116 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:115 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:114 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:113 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:112 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:111 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:110 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:109 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:108 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:107 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:104 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:103 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:102 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:101 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:100 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:98 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:97 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:96 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:95 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:94 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:93 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:92 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:91 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:90 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:89 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:88 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:87 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:86 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:85 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:84 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:83 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:82 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:81 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:79 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:78 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:77 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:76 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:75 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:74 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:73 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:72 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:71 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:70 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:69 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:68 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:67 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:64 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:63 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:62 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:61 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:60 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:58 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:57 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:56 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:55 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:54 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:53 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:52 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:51 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:50 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:49 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:48 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:47 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:46 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:45 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:44 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:43 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:42 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:41 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:39 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:38 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:37 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:36 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:35 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:34 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:33 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:32 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:31 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:29 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:28 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:27 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:24 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:23 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:22 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:21 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:20 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:19 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:18 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:16 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:17 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:8 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:9 |
| ; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 |
| ; CHECK-NEXT: ds_read_u8 v8, v2 offset:11 |
| ; CHECK-NEXT: ds_read_u8 v9, v2 offset:12 |
| ; CHECK-NEXT: ds_read_u8 v10, v2 offset:13 |
| ; CHECK-NEXT: ds_read_u8 v11, v2 offset:14 |
| ; CHECK-NEXT: ds_read_u8 v12, v2 offset:15 |
| ; CHECK-NEXT: ds_read_u8 v13, v2 |
| ; CHECK-NEXT: ds_read_u8 v14, v2 offset:1 |
| ; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 |
| ; CHECK-NEXT: ds_read_u8 v16, v2 offset:3 |
| ; CHECK-NEXT: ds_read_u8 v17, v2 offset:4 |
| ; CHECK-NEXT: ds_read_u8 v18, v2 offset:5 |
| ; CHECK-NEXT: ds_read_u8 v19, v2 offset:6 |
| ; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) |
| ret void |
| } |
| |
| ; memcpy_p0_p0_optsize: copy a constant 47 bytes between two generic (flat, |
| ; addrspace 0) kernel-argument pointers. |
| ; The memcpy call below carries no alignment on either pointer; the expected |
| ; output is a fully unrolled sequence of 47 single-byte flat load/store pairs |
| ; (offsets 0 through 46), each load waited on before its store is issued. |
| ; NOTE(review): attribute group #1 is defined outside this part of the file; |
| ; presumably it is the optsize group, going by the function name - confirm. |
| define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { |
| ; CHECK-LABEL: memcpy_p0_p0_optsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s2 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s3 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] |
| ; CHECK-NEXT: v_mov_b32_e32 v3, s1 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45 |
| ; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) |
| ret void |
| } |
| |
| ; memcpy_p1_p1_optsize: copy a constant 47 bytes between two addrspace(1) |
| ; kernel-argument pointers. |
| ; Expected output uses wide global accesses: two dwordx4 copies cover bytes |
| ; 0-31, and two dwordx2 copies at offsets 32 and 39 cover bytes 32-46. The |
| ; last two overlap by one byte (byte 39), so the odd 47-byte length is |
| ; handled without any narrower tail accesses. |
| ; NOTE(review): attribute group #1 is defined outside this part of the file; |
| ; presumably it is the optsize group, going by the function name - confirm. |
| define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 { |
| ; CHECK-LABEL: memcpy_p1_p1_optsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, 0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32 |
| ; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false) |
| ret void |
| } |
| |
| ; memcpy_p1_p4_optsize: copy a constant 128 bytes from an addrspace(4) |
| ; kernel-argument pointer to an addrspace(1) kernel-argument pointer. |
| ; Expected output is eight dwordx4 load/store pairs at offsets 0, 16, ..., |
| ; 112 (8 x 16 bytes = 128 bytes), each load waited on before its store. |
| ; Note that the source is read with global_load instructions here, not |
| ; scalar loads. |
| ; NOTE(review): attribute group #1 is defined outside this part of the file; |
| ; presumably it is the optsize group, going by the function name - confirm. |
| define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 { |
| ; CHECK-LABEL: memcpy_p1_p4_optsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, 0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false) |
| ret void |
| } |
| |
| define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 { |
| ; CHECK-LABEL: memcpy_p5_p4_optsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 |
| ; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: s_add_u32 s8, s8, s7 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15 |
| ; CHECK-NEXT: s_addc_u32 s9, s9, 0 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s2 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18 |
| ; CHECK-NEXT: s_waitcnt vmcnt(18) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19 |
| ; CHECK-NEXT: s_waitcnt vmcnt(19) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20 |
| ; CHECK-NEXT: s_waitcnt vmcnt(20) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21 |
| ; CHECK-NEXT: s_waitcnt vmcnt(21) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22 |
| ; CHECK-NEXT: s_waitcnt vmcnt(22) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23 |
| ; CHECK-NEXT: s_waitcnt vmcnt(23) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24 |
| ; CHECK-NEXT: s_waitcnt vmcnt(24) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25 |
| ; CHECK-NEXT: s_waitcnt vmcnt(25) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27 |
| ; CHECK-NEXT: s_waitcnt vmcnt(27) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28 |
| ; CHECK-NEXT: s_waitcnt vmcnt(28) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29 |
| ; CHECK-NEXT: s_waitcnt vmcnt(29) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30 |
| ; CHECK-NEXT: s_waitcnt vmcnt(30) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31 |
| ; CHECK-NEXT: s_waitcnt vmcnt(31) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(32) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33 |
| ; CHECK-NEXT: s_waitcnt vmcnt(33) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34 |
| ; CHECK-NEXT: s_waitcnt vmcnt(34) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35 |
| ; CHECK-NEXT: s_waitcnt vmcnt(35) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70 |
| ; CHECK-NEXT: s_waitcnt vmcnt(35) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107 |
| ; CHECK-NEXT: s_waitcnt vmcnt(35) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89 |
| ; CHECK-NEXT: s_waitcnt vmcnt(35) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90 |
| ; CHECK-NEXT: s_waitcnt vmcnt(34) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91 |
| ; CHECK-NEXT: s_waitcnt vmcnt(33) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92 |
| ; CHECK-NEXT: s_waitcnt vmcnt(32) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93 |
| ; CHECK-NEXT: s_waitcnt vmcnt(31) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94 |
| ; CHECK-NEXT: s_waitcnt vmcnt(30) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95 |
| ; CHECK-NEXT: s_waitcnt vmcnt(29) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96 |
| ; CHECK-NEXT: s_waitcnt vmcnt(28) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97 |
| ; CHECK-NEXT: s_waitcnt vmcnt(27) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99 |
| ; CHECK-NEXT: s_waitcnt vmcnt(25) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100 |
| ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108 |
| ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109 |
| ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 |
| ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111 |
| ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112 |
| ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113 |
| ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114 |
| ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115 |
| ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116 |
| ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117 |
| ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118 |
| ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101 |
| ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102 |
| ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103 |
| ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104 |
| ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105 |
| ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106 |
| ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125 |
| ; CHECK-NEXT: s_waitcnt vmcnt(36) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107 |
| ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118 |
| ; CHECK-NEXT: s_waitcnt vmcnt(26) |
| ; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119 |
| ; CHECK-NEXT: s_waitcnt vmcnt(25) |
| ; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120 |
| ; CHECK-NEXT: s_waitcnt vmcnt(24) |
| ; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121 |
| ; CHECK-NEXT: s_waitcnt vmcnt(23) |
| ; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122 |
| ; CHECK-NEXT: s_waitcnt vmcnt(22) |
| ; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123 |
| ; CHECK-NEXT: s_waitcnt vmcnt(21) |
| ; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124 |
| ; CHECK-NEXT: s_waitcnt vmcnt(20) |
| ; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125 |
| ; CHECK-NEXT: s_waitcnt vmcnt(19) |
| ; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126 |
| ; CHECK-NEXT: s_waitcnt vmcnt(19) |
| ; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) |
| ret void |
| } |
| |
| ; memcpy_p0_p5_optsize: copies 128 bytes from an addrspace(5) source (%src) to a |
| ; generic/flat destination (%generic) through a non-volatile llvm.memcpy. |
| ; The expected gfx908 code below is fully unrolled into byte-sized accesses: |
| ; one buffer_load_ubyte per source offset 0..127, each matched by a |
| ; flat_store_byte to the same offset of the destination. |
| ; Attribute group #1 is defined outside this excerpt; presumably it carries |
| ; optsize, as the function name suggests -- TODO confirm against the attribute list. |
| ; The CHECK lines are autogenerated (see the NOTE on the first line of this file); |
| ; regenerate them with utils/update_llc_test_checks.py instead of editing by hand. |
| define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { |
| ; CHECK-LABEL: memcpy_p0_p5_optsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] |
| ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 |
| ; CHECK-NEXT: s_add_u32 s8, s8, s7 |
| ; CHECK-NEXT: s_addc_u32 s9, s9, 0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13 |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s0 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s1 |
| ; CHECK-NEXT: s_waitcnt vmcnt(17) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101 |
| ; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109 |
| ; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110 |
| ; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113 |
| ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114 |
| ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116 |
| ; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117 |
| ; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118 |
| ; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102 |
| ; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103 |
| ; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104 |
| ; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105 |
| ; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106 |
| ; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107 |
| ; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126 |
| ; CHECK-NEXT: s_nop 0 |
| ; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| ; Constant length 128, volatile flag false; neither pointer operand carries an |
| ; explicit align attribute at this call site. |
| tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) |
| ret void |
| } |
| |
| ; memcpy_p3_p4_optsize: copies 128 bytes from an addrspace(4) pointer (%0) into |
| ; the addrspace(3) global @shared through a non-volatile llvm.memcpy. @shared is |
| ; a %struct.S = { [32 x i32] } declared at the top of the file, so the copy |
| ; covers the whole object. |
| ; The expected gfx908 code uses wide accesses: eight global_load_dwordx4 |
| ; (source offsets 112 down to 0), each matched by one ds_write2_b64 that writes |
| ; the loaded 16 bytes as two 64-bit halves. v24 is set to 0 and serves both as |
| ; the global address offset and as the LDS base address. |
| ; Attribute group #1 is defined outside this excerpt; presumably it carries |
| ; optsize, as the function name suggests -- TODO confirm against the attribute list. |
| ; The CHECK lines are autogenerated (see the NOTE on the first line of this file); |
| ; regenerate them with utils/update_llc_test_checks.py instead of editing by hand. |
| define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 { |
| ; CHECK-LABEL: memcpy_p3_p4_optsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v24, 0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 |
| ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 |
| ; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 |
| ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 |
| ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 |
| ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 |
| ; CHECK-NEXT: s_waitcnt vmcnt(5) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15 |
| ; CHECK-NEXT: s_waitcnt vmcnt(4) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13 |
| ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 |
| ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] |
| ; CHECK-NEXT: s_waitcnt vmcnt(5) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11 |
| ; CHECK-NEXT: s_waitcnt vmcnt(4) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(3) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(2) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5 |
| ; CHECK-NEXT: s_waitcnt vmcnt(1) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| ; Constant length 128, volatile flag false; the destination is the module-level |
| ; @shared global rather than a kernel argument. |
| tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false) |
| ret void |
| } |
| |
| define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { |
| ; CHECK-LABEL: memcpy_p0_p3_optsize: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, 0 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:127 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:126 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:125 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:124 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s0 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s1 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:123 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:122 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:121 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:119 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:118 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:117 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:116 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:115 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:114 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:113 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:112 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:111 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:110 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:109 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:108 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:107 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:104 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:103 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:102 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:101 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:100 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:98 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:97 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:96 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:95 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:94 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:93 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:92 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:91 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:90 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:89 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:88 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:87 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:86 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:85 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:84 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:83 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:82 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:81 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:79 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:78 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:77 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:76 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:75 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:74 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:73 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:72 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:71 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:70 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:69 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:68 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:67 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:64 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:63 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:62 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:61 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:60 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:58 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:57 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:56 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:55 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:54 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:53 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:52 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:51 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:50 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:49 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:48 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:47 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:46 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:45 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:44 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:43 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:42 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:41 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:39 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:38 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:37 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:36 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:35 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:34 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:33 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:32 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:31 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:29 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:28 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:27 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:24 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:23 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:22 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:21 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:20 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:19 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:18 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 offset:16 |
| ; CHECK-NEXT: ds_read_u8 v5, v2 offset:17 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18 |
| ; CHECK-NEXT: ds_read_u8 v4, v2 offset:8 |
| ; CHECK-NEXT: ds_read_u8 v6, v2 offset:9 |
| ; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 |
| ; CHECK-NEXT: ds_read_u8 v8, v2 offset:11 |
| ; CHECK-NEXT: ds_read_u8 v9, v2 offset:12 |
| ; CHECK-NEXT: ds_read_u8 v10, v2 offset:13 |
| ; CHECK-NEXT: ds_read_u8 v11, v2 offset:14 |
| ; CHECK-NEXT: ds_read_u8 v12, v2 offset:15 |
| ; CHECK-NEXT: ds_read_u8 v13, v2 |
| ; CHECK-NEXT: ds_read_u8 v14, v2 offset:1 |
| ; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 |
| ; CHECK-NEXT: ds_read_u8 v16, v2 offset:3 |
| ; CHECK-NEXT: ds_read_u8 v17, v2 offset:4 |
| ; CHECK-NEXT: ds_read_u8 v18, v2 offset:5 |
| ; CHECK-NEXT: ds_read_u8 v19, v2 offset:6 |
| ; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1 |
| ; CHECK-NEXT: flat_store_byte v[0:1], v13 |
| ; CHECK-NEXT: s_endpgm |
| entry: |
| tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) |
| ret void |
| } |
| |
| ; Declarations of the llvm.memcpy intrinsic overloads, one per |
| ; (destination, source) address-space pair. The mangled suffix .pD.pS.i64 |
| ; names the destination address space D, the source address space S and the |
| ; i64 type of the length operand. |
| ; Operands, in order: destination pointer (writeonly), source pointer |
| ; (readonly), number of bytes to copy, and an immediate i1 "is volatile" flag. |
| ; AMDGPU address-space numbering: 0 = flat/generic, 1 = global, 3 = LDS |
| ; (local), 4 = constant, 5 = private (scratch). |
| declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2 |
| |
| declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2 |
| |
| declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2 |
| |
| declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 |
| |
| declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 |
| |
| declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 |
| |
| declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2 |
| |
| ; Attribute groups referenced as #N elsewhere in this file. |
| ;   #0 (minsize): keep code size as small as possible, even at the cost of |
| ;                 run-time performance; used by the *_minsize test kernels. |
| ;   #1 (optsize): keep code size low without significantly hurting run-time |
| ;                 performance. NOTE(review): presumably attached to the |
| ;                 *_optsize test kernels -- confirm against their definitions. |
| ;   #2: attributes of the llvm.memcpy intrinsic declarations; the |
| ;       memory(argmem: readwrite) entry states that they only access memory |
| ;       through their pointer arguments. |
| attributes #0 = { minsize } |
| attributes #1 = { optsize } |
| attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } |