|  | ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py | 
|  | ; RUN:  opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR %s | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_add_i32_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_add_i32_max_neg_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024 | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_add_i32_soffset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000 | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_add_i32_huge_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595 | 
|  |  | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_add_i32_ret_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) | 
|  | ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] | 
|  | ; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] | 
|  | ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_add_i32_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_add_i32_ret_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) | 
|  | ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] | 
|  | ; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] | 
|  | ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_add_i32( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_add_i32_ret( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) | 
|  | ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] | 
|  | ; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] | 
|  | ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_add_i32_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_add_i32_ret_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) | 
|  | ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] | 
|  | ; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] | 
|  | ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_and_i32_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_and_i32_ret_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_and_i32_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_and_i32_ret_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_and_i32( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_and_i32_ret( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_and_i32_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_and_i32_ret_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_sub_i32_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_sub_i32_ret_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) | 
|  | ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] | 
|  | ; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] | 
|  | ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_sub_i32_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_sub_i32_ret_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) | 
|  | ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] | 
|  | ; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] | 
|  | ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_sub_i32( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_sub_i32_ret( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) | 
|  | ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] | 
|  | ; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] | 
|  | ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_sub_i32_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_sub_i32_ret_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) | 
|  | ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] | 
|  | ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] | 
|  | ; IR:       10: | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP12]] | 
|  | ; IR:       12: | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) | 
|  | ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] | 
|  | ; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] | 
|  | ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_max_i32_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_max_i32_ret_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_max_i32_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_max_i32_ret_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_max_i32( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_max_i32_ret( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_max_i32_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_max_i32_ret_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
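|  | ; The umax checks follow the same shape; in the returning forms the | 
|  | ; reconstruction select uses 0, the unsigned-max identity, instead of the | 
|  | ; signed minimum. | 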
|  | define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_umax_i32_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_umax_i32_ret_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_umax_i32_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_umax_i32_ret_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_umax_i32( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_umax_i32_ret( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_umax_i32_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_umax_i32_ret_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
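|  | ; The signed-min checks mirror the max ones with the identity flipped to | 
|  | ; 2147483647 (INT32_MAX) and the recombination comparison using slt rather | 
|  | ; than sgt. | 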
|  | define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_min_i32_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_min_i32_ret_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 | 
|  | %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_min_i32_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_min_i32_ret_addr64_offset( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 | 
|  | %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { | 
|  | ; IR-LABEL: @atomic_min_i32( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { | 
|  | ; IR-LABEL: @atomic_min_i32_ret( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_min_i32_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { | 
|  | ; IR-LABEL: @atomic_min_i32_ret_addr64( | 
|  | ; IR-NEXT:  entry: | 
|  | ; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] | 
|  | ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) | 
|  | ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 | 
|  | ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 | 
|  | ; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 | 
|  | ; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) | 
|  | ; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) | 
|  | ; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 | 
|  | ; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] | 
|  | ; IR:       7: | 
|  | ; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 | 
|  | ; IR-NEXT:    br label [[TMP9]] | 
|  | ; IR:       9: | 
|  | ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] | 
|  | ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) | 
|  | ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] | 
|  | ; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] | 
|  | ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] | 
|  | ; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 | 
|  | ; IR-NEXT:    ret void | 
|  | ; | 
|  | entry: | 
|  | %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index | 
|  | %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst | 
|  | store i32 %val, ptr addrspace(1) %out2 | 
|  | ret void | 
|  | } |