blob: 2a1824b0ca4aef823f731349cd2c0fcd6dab3998 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
;---------------------------------------------------------------------
; TODO: atomicrmw xchg
;---------------------------------------------------------------------
; ; xchg is supported over PCIe, so no expansion is necessary
; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst
; ret <2 x half> %res
; }
; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0
; ret <2 x half> %res
; }
; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
; ret <2 x half> %res
; }
; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
; ret <2 x half> %res
; }
;---------------------------------------------------------------------
; atomicrmw fadd
;---------------------------------------------------------------------
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_daz(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_dynamic(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
;---------------------------------------------------------------------
; atomicrmw fsub
;---------------------------------------------------------------------
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
;---------------------------------------------------------------------
; atomicrmw fmax
;---------------------------------------------------------------------
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
;---------------------------------------------------------------------
; atomicrmw fmin
;---------------------------------------------------------------------
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" }
attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10: {{.*}}
; GFX11: {{.*}}
; GFX12: {{.*}}
; GFX803: {{.*}}
; GFX906: {{.*}}
; GFX908: {{.*}}
; GFX90A: {{.*}}
; GFX940: {{.*}}