; Source: llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll - third_party/github.com/llvm/llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s

 ; Simplest case, if - then, that requires lane mask merging.
 ; The %phi lane mask will hold %val_A at %A. Lanes that are active in %B
 ; will overwrite their own lane bit in the lane mask with %val_B.
 ; Stores 1 to %out when the i1 %phi in %exit is true and 2 otherwise.
 ; %phi is %val_A (tid >= 6, unsigned) when control arrives directly from %A
 ; and %val_B (tid < 1, unsigned) when it arrives through %B, which is entered
 ; only when %cond == 0.
 define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
 ; GFX10-LABEL: divergent_i1_phi_if_then:
 ; GFX10:       ; %bb.0: ; %A
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:  ; %bb.1: ; %B
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 1, v2
 ; GFX10-NEXT:  ; %bb.2: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 A:
   ; Value the phi takes on the edge that skips %B.
   %val_A = icmp uge i32 %tid, 6
   ; %B is entered only when %cond == 0; otherwise go straight to %exit.
   %cmp = icmp eq i32 %cond, 0
   br i1 %cmp, label %B, label %exit

 B:
   ; Value the phi takes on the path through %B.
   %val_B = icmp ult i32 %tid, 1
   br label %exit

 exit:
   %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
   ; Map the merged i1 to an i32: true -> 1, false -> 2.
   %sel = select i1 %phi, i32 1, i32 2
   store i32 %sel, ptr addrspace(1) %out
   ret void
 }

 ; if - else
 ; Stores 1 to %out when the i1 %phi in %exit is true and 2 otherwise.
 ; %entry branches to %A when %cond == 0 and to %B otherwise; %phi is
 ; %val_A (tid >= 1, unsigned) coming from %A and %val_B (tid < 2, unsigned)
 ; coming from %B.
 define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
 ; GFX10-LABEL: divergent_i1_phi_if_else:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX10-NEXT:  ; %bb.1: ; %B
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 2, v2
 ; GFX10-NEXT:    ; implicit-def: $vgpr2
 ; GFX10-NEXT:  ; %bb.2: ; %Flow
 ; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
 ; GFX10-NEXT:  ; %bb.3: ; %A
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 1, v2
 ; GFX10-NEXT:  ; %bb.4: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 entry:
   ; %cond == 0 selects the %A side, anything else the %B side.
   %cmp = icmp eq i32 %cond, 0
   br i1 %cmp, label %A, label %B

 A:
   ; Phi input for the %A side.
   %val_A = icmp uge i32 %tid, 1
   br label %exit

 B:
   ; Phi input for the %B side.
   %val_B = icmp ult i32 %tid, 2
   br label %exit

 exit:
   %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
   ; Map the merged i1 to an i32: true -> 1, false -> 2.
   %sel = select i1 %phi, i32 1, i32 2
   store i32 %sel, ptr addrspace(1) %out
   ret void
 }

 ; if - break;

 ;  counter = 0;
 ;  do {
 ;    if (a[counter] == 0)
 ;      break;
 ;    if (b[counter] == 0)
 ;      break;
 ;    if (c[counter] == 0)
 ;      break;
 ;    x[counter++]+=1;
 ;  } while (counter<100);

 ; Tests with multiple break conditions. Divergent phis will be used to track
 ; if any of the break conditions was reached. We only need to do simple lane
 ; mask merging (for current loop iteration only). There is an intrinsic,
 ; if_break, that will merge lane masks across all iterations of the loop.

 ; Loop with one early exit. Starting at %counter = 0, each iteration loads
 ; a[counter] and leaves the loop when it is 0; otherwise it adds 1 to
 ; x[counter] and advances the counter.
 ; NOTE(review): the latch in %loop.body branches to %exit when
 ; %counter < 100 (unsigned, pre-increment value) and back to %A otherwise,
 ; which is the opposite of the 'while (counter<100)' pseudo-code earlier in
 ; this file. The autogenerated assertions were produced from this form, so
 ; confirm the intent before changing it.
 define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
 ; GFX10-LABEL: loop_with_1break:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    s_branch .LBB2_2
 ; GFX10-NEXT:  .LBB2_1: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX10-NEXT:  .LBB2_2: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 2, v[4:5]
 ; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v2, v5
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo
 ; GFX10-NEXT:    global_load_dword v7, v[7:8], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB2_1
 ; GFX10-NEXT:  ; %bb.3: ; %loop.body
 ; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v5, vcc_lo, v0, v5
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v4
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v4
 ; GFX10-NEXT:    global_load_dword v7, v[5:6], off
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 1, v7
 ; GFX10-NEXT:    global_store_dword v[5:6], v7, off
 ; GFX10-NEXT:    s_branch .LBB2_1
 ; GFX10-NEXT:  .LBB2_4: ; %exit
 ; GFX10-NEXT:    s_endpgm
 entry:
   br label %A

 A:
   ; %counter is 0 on entry and %counter.plus.1 when coming back from %loop.body.
   %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
   %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
   %a.val = load i32, ptr addrspace(1) %a.plus.counter
   ; Break: leave the loop when a[counter] == 0.
   %a.cond = icmp eq i32 %a.val, 0
   br i1 %a.cond, label %exit, label %loop.body

 loop.body:
   ; x[counter] += 1
   %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
   %x.val = load i32, ptr addrspace(1) %x.plus.counter
   %x.val.plus.1 = add i32 %x.val, 1
   store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
   %counter.plus.1 = add i32 %counter, 1
   ; The compare uses the pre-increment %counter; a true result goes to %exit.
   %x.cond = icmp ult i32 %counter, 100
   br i1 %x.cond, label %exit, label %A

 exit:
   ret void
 }

 ; Loop with two early exits. Starting at %counter = 0, each iteration leaves
 ; the loop when a[counter] == 0 (checked in %A) or when b[counter] == 0
 ; (checked in %B); otherwise it adds 1 to x[counter] and advances the counter.
 ; NOTE(review): the latch in %loop.body branches to %exit when
 ; %counter < 100 (unsigned, pre-increment value) and back to %A otherwise,
 ; which is the opposite of the 'while (counter<100)' pseudo-code earlier in
 ; this file. The autogenerated assertions were produced from this form, so
 ; confirm the intent before changing it.
 define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
 ; GFX10-LABEL: loop_with_2breaks:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_branch .LBB3_3
 ; GFX10-NEXT:  .LBB3_1: ; %Flow3
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10-NEXT:  .LBB3_2: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  .LBB3_3: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
 ; GFX10-NEXT:    global_load_dword v9, v[9:10], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX10-NEXT:  ; %bb.4: ; %B
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
 ; GFX10-NEXT:    global_load_dword v9, v[9:10], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
 ; GFX10-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_1
 ; GFX10-NEXT:  ; %bb.5: ; %loop.body
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v6
 ; GFX10-NEXT:    global_load_dword v9, v[7:8], off
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v10
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
 ; GFX10-NEXT:    global_store_dword v[7:8], v9, off
 ; GFX10-NEXT:    s_branch .LBB3_1
 ; GFX10-NEXT:  .LBB3_6: ; %exit
 ; GFX10-NEXT:    s_endpgm
 entry:
   br label %A

 A:
   ; %counter is 0 on entry and %counter.plus.1 when coming back from %loop.body.
   %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
   %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
   %a.val = load i32, ptr addrspace(1) %a.plus.counter
   ; First break: leave the loop when a[counter] == 0.
   %a.cond = icmp eq i32 %a.val, 0
   br i1 %a.cond, label %exit, label %B

 B:
   %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
   %b.val = load i32, ptr addrspace(1) %b.plus.counter
   ; Second break: leave the loop when b[counter] == 0.
   %b.cond = icmp eq i32 %b.val, 0
   br i1 %b.cond, label %exit, label %loop.body

 loop.body:
   ; x[counter] += 1
   %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
   %x.val = load i32, ptr addrspace(1) %x.plus.counter
   %x.val.plus.1 = add i32 %x.val, 1
   store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
   %counter.plus.1 = add i32 %counter, 1
   ; The compare uses the pre-increment %counter; a true result goes to %exit.
   %x.cond = icmp ult i32 %counter, 100
   br i1 %x.cond, label %exit, label %A

 exit:
   ret void
 }

 ; Loop with three early exits. Starting at %counter = 0, each iteration leaves
 ; the loop when a[counter] == 0 (%A), b[counter] == 0 (%B) or
 ; c[counter] == 0 (%C); otherwise it adds 1 to x[counter] and advances the
 ; counter.
 ; NOTE(review): the latch in %loop.body branches to %exit when
 ; %counter < 100 (unsigned, pre-increment value) and back to %A otherwise,
 ; which is the opposite of the 'while (counter<100)' pseudo-code earlier in
 ; this file. The autogenerated assertions were produced from this form, so
 ; confirm the intent before changing it.
 define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
 ; GFX10-LABEL: loop_with_3breaks:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s0
 ; GFX10-NEXT:    s_branch .LBB4_4
 ; GFX10-NEXT:  .LBB4_1: ; %Flow5
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:  .LBB4_2: ; %Flow4
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10-NEXT:  .LBB4_3: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX10-NEXT:  .LBB4_4: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
 ; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_lshlrev_b64 v[9:10], 2, v[8:9]
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v2, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX10-NEXT:  ; %bb.5: ; %B
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v4, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX10-NEXT:  ; %bb.6: ; %C
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v6, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_1
 ; GFX10-NEXT:  ; %bb.7: ; %loop.body
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v0, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v12, 1, v8
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v8
 ; GFX10-NEXT:    global_load_dword v11, v[9:10], off
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v12
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v11
 ; GFX10-NEXT:    global_store_dword v[9:10], v11, off
 ; GFX10-NEXT:    s_branch .LBB4_1
 ; GFX10-NEXT:  .LBB4_8: ; %exit
 ; GFX10-NEXT:    s_endpgm
 entry:
   br label %A

 A:
   ; %counter is 0 on entry and %counter.plus.1 when coming back from %loop.body.
   %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
   %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
   %a.val = load i32, ptr addrspace(1) %a.plus.counter
   ; First break: leave the loop when a[counter] == 0.
   %a.cond = icmp eq i32 %a.val, 0
   br i1 %a.cond, label %exit, label %B

 B:
   %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
   %b.val = load i32, ptr addrspace(1) %b.plus.counter
   ; Second break: leave the loop when b[counter] == 0.
   %b.cond = icmp eq i32 %b.val, 0
   br i1 %b.cond, label %exit, label %C

 C:
   %c.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %c, i32 %counter
   %c.val = load i32, ptr addrspace(1) %c.plus.counter
   ; Third break: leave the loop when c[counter] == 0.
   %c.cond = icmp eq i32 %c.val, 0
   br i1 %c.cond, label %exit, label %loop.body

 loop.body:
   ; x[counter] += 1
   %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
   %x.val = load i32, ptr addrspace(1) %x.plus.counter
   %x.val.plus.1 = add i32 %x.val, 1
   store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
   %counter.plus.1 = add i32 %counter, 1
   ; The compare uses the pre-increment %counter; a true result goes to %exit.
   %x.cond = icmp ult i32 %counter, 100
   br i1 %x.cond, label %exit, label %A

 exit:
   ret void
 }

 ; An 'if' with a divergent condition and a body that ends with a break. This
 ; is a loop with two exits, but the structurizer will create a phi that tracks
 ; the exit taken via the break and will move break.body after the loop. The
 ; loop will then have one exit, and the phi will be used outside of the loop by
 ; the condition that guards entry to break.body.
 ; Loop whose early exit runs extra code. Starting at %counter = 0, each
 ; iteration loads a[counter]; when it is 0 control goes to %break.body, which
 ; stores 10 to %a.break and then reaches %exit. Otherwise %loop.body adds 1 to
 ; x[counter], advances the counter, and either exits or loops back to %A.
 ; NOTE(review): the latch in %loop.body branches to %exit when
 ; %counter < 100 (unsigned, pre-increment value) and back to %A otherwise,
 ; which is the opposite of the 'while (counter<100)' pseudo-code earlier in
 ; this file. The autogenerated assertions were produced from this form, so
 ; confirm the intent before changing it.
 define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
 ; GFX10-LABEL: loop_with_div_break_with_body:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_branch .LBB5_2
 ; GFX10-NEXT:  .LBB5_1: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_and_b32 s1, 1, s3
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX10-NEXT:  .LBB5_2: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_mov_b32 s3, 1
 ; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
 ; GFX10-NEXT:    global_load_dword v9, v[9:10], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB5_1
 ; GFX10-NEXT:  ; %bb.3: ; %loop.body
 ; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v6
 ; GFX10-NEXT:    s_mov_b32 s3, 0
 ; GFX10-NEXT:    global_load_dword v9, v[7:8], off
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v10
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
 ; GFX10-NEXT:    global_store_dword v[7:8], v9, off
 ; GFX10-NEXT:    s_branch .LBB5_1
 ; GFX10-NEXT:  .LBB5_4: ; %loop.exit.guard
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_and_saveexec_b32 s0, s1
 ; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX10-NEXT:  ; %bb.5: ; %break.body
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 10
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:  .LBB5_6: ; %exit
 ; GFX10-NEXT:    s_endpgm
 entry:
   br label %A

 A:
   ; %counter is 0 on entry and %counter.plus.1 when coming back from %loop.body.
   %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
   %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
   %a.val = load i32, ptr addrspace(1) %a.plus.counter
   ; Break with a body: go to %break.body when a[counter] == 0.
   %a.cond = icmp eq i32 %a.val, 0
   br i1 %a.cond, label %break.body, label %loop.body

 break.body:
   ; Reached only from %A's break edge: store the constant 10 to %a.break.
   store i32 10, ptr addrspace(1) %a.break
   br label %exit


 loop.body:
   ; x[counter] += 1
   %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
   %x.val = load i32, ptr addrspace(1) %x.plus.counter
   %x.val.plus.1 = add i32 %x.val, 1
   store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
   %counter.plus.1 = add i32 %counter, 1
   ; The compare uses the pre-increment %counter; a true result goes to %exit.
   %x.cond = icmp ult i32 %counter, 100
   br i1 %x.cond, label %exit, label %A

 exit:
   ret void
 }

 ; Snippet from test generated by the GraphicsFuzz tool, frontend generates ir
 ; with irreducible control flow graph. FixIrreducible converts it into natural
 ; loop and in the process creates i1 phi with three incoming values.

 ; int loop(int x, int y, int a0, int a1, int a2, int a3, int a4) {
 ;   do {
 ;     if (y < a2) {
 ;       do {
 ;       } while (x < a2);
 ;     }
 ;     if (x < a3) {
 ;       return a1;
 ;     }
 ;   } while (y < a2);
 ;   return a0;
 ; }

 ; This test is also interesting because it has phi with three incomings
 ;define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ;.entry:
 ; %.y_lt_a2 = icmp sgt i32 %a2, %y
 ; %.x_lt_a2 = icmp sgt i32 %a2, %x
 ; %.x_lt_a3 = icmp sgt i32 %a3, %x
 ; br i1 %.y_lt_a2, label %.preheader, label %.loopexit ; first iteration, jump to inner loop if 'y < a2' or start with 'if (x < a3)'
 ;
 ;.preheader: ; if (y < a2),
 ; br label %.inner_loop
 ;
 ;.inner_loop: ; do while x < a2
 ; br i1 %.x_lt_a2, label %.inner_loop, label %.loopexit
 ;
 ;.loopexit: ; if x < a3
 ; %not.inner_loop = xor i1 %.y_lt_a2, true
 ; %brmerge = select i1 %.x_lt_a3, i1 true, i1 %not.inner_loop ; exit loop if 'x < a3' or 'loop ends since !(y < a2)'
 ; %.ret = select i1 %.x_lt_a3, i32 %a1, i32 %a0               ; select return value a1 'x < a3' or a0 'loop ends'
 ; br i1 %brmerge, label %.exit, label %.preheader
 ;
 ;.exit:
 ; ret i32 %.ret
 ;}
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
	; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s

	; Simplest case, if - then, that requires lane mask merging.
	; The %phi lane mask will hold %val_A at %A. Lanes that are active in %B
	; will overwrite their own lane bit in the lane mask with %val_B.
	; Stores 1 to %out when the i1 %phi in %exit is true and 2 otherwise.
	; %phi is %val_A (tid >= 6, unsigned) when control arrives directly from %A
	; and %val_B (tid < 1, unsigned) when it arrives through %B, which is entered
	; only when %cond == 0.
	; NOTE(review): @divergent_i1_phi_if_then is also defined earlier in this
	; file with the same body; this second copy is presumably an extraction
	; artifact - confirm which definition should be kept.
	define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
	; GFX10-LABEL: divergent_i1_phi_if_then:
	; GFX10: ; %bb.0: ; %A
	; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2
	; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
	; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
	; GFX10-NEXT: ; %bb.1: ; %B
	; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
	; GFX10-NEXT: ; %bb.2: ; %exit
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
	; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
	; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
	; GFX10-NEXT: global_store_dword v[0:1], v2, off
	; GFX10-NEXT: s_endpgm
	A:
	; Value the phi takes on the edge that skips %B.
	%val_A = icmp uge i32 %tid, 6
	; %B is entered only when %cond == 0; otherwise go straight to %exit.
	%cmp = icmp eq i32 %cond, 0
	br i1 %cmp, label %B, label %exit

	B:
	; Value the phi takes on the path through %B.
	%val_B = icmp ult i32 %tid, 1
	br label %exit

	exit:
	%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
	; Map the merged i1 to an i32: true -> 1, false -> 2.
	%sel = select i1 %phi, i32 1, i32 2
	store i32 %sel, ptr addrspace(1) %out
	ret void
	}

	; if - else
	; Stores 1 to %out when the i1 %phi in %exit is true and 2 otherwise.
	; %entry branches to %A when %cond == 0 and to %B otherwise; %phi is
	; %val_A (tid >= 1, unsigned) coming from %A and %val_B (tid < 2, unsigned)
	; coming from %B.
	; NOTE(review): @divergent_i1_phi_if_else is also defined earlier in this
	; file with the same body; this second copy is presumably an extraction
	; artifact - confirm which definition should be kept.
	define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
	; GFX10-LABEL: divergent_i1_phi_if_else:
	; GFX10: ; %bb.0: ; %entry
	; GFX10-NEXT: s_and_b32 s0, 1, s0
	; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
	; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
	; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
	; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
	; GFX10-NEXT: ; %bb.1: ; %B
	; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
	; GFX10-NEXT: ; implicit-def: $vgpr2
	; GFX10-NEXT: ; %bb.2: ; %Flow
	; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
	; GFX10-NEXT: ; %bb.3: ; %A
	; GFX10-NEXT: v_cmp_le_u32_e64 s0, 1, v2
	; GFX10-NEXT: ; %bb.4: ; %exit
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
	; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
	; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
	; GFX10-NEXT: global_store_dword v[0:1], v2, off
	; GFX10-NEXT: s_endpgm
	entry:
	; %cond == 0 selects the %A side, anything else the %B side.
	%cmp = icmp eq i32 %cond, 0
	br i1 %cmp, label %A, label %B

	A:
	; Phi input for the %A side.
	%val_A = icmp uge i32 %tid, 1
	br label %exit

	B:
	; Phi input for the %B side.
	%val_B = icmp ult i32 %tid, 2
	br label %exit

	exit:
	%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
	; Map the merged i1 to an i32: true -> 1, false -> 2.
	%sel = select i1 %phi, i32 1, i32 2
	store i32 %sel, ptr addrspace(1) %out
	ret void
	}

	; if - break;

	; counter = 0;
	; do {
	; if (a[counter] == 0)
	; break;
	; if (b[counter] == 0)
	; break;
	; if (c[counter] == 0)
	; break;
	; x[counter++]+=1;
	; } while (counter<100);

	; Tests with multiple break conditions. Divergent phis will be used to track
	; if any of the break conditions was reached. We only need to do simple lane
	; mask merging (for current loop iteration only). There is an intrinsic,
	; if_break, that will merge lane masks across all iterations of the loop.

	; Loop with one early exit. Starting at %counter = 0, each iteration loads
	; a[counter] and leaves the loop when it is 0; otherwise it adds 1 to
	; x[counter] and advances the counter.
	; NOTE(review): the latch in %loop.body branches to %exit when
	; %counter < 100 (unsigned, pre-increment value) and back to %A otherwise,
	; which is the opposite of the 'while (counter<100)' pseudo-code earlier in
	; this file - confirm the intent before changing it.
	; NOTE(review): @loop_with_1break is also defined earlier in this file with
	; the same body; this second copy is presumably an extraction artifact -
	; confirm which definition should be kept.
	define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
	; GFX10-LABEL: loop_with_1break:
	; GFX10: ; %bb.0: ; %entry
	; GFX10-NEXT: s_mov_b32 s0, 0
	; GFX10-NEXT: v_mov_b32_e32 v4, s0
	; GFX10-NEXT: s_branch .LBB2_2
	; GFX10-NEXT: .LBB2_1: ; %Flow
	; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
	; GFX10-NEXT: s_waitcnt_depctr 0xffe3
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
	; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
	; GFX10-NEXT: s_or_b32 s0, s1, s0
	; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
	; GFX10-NEXT: s_cbranch_execz .LBB2_4
	; GFX10-NEXT: .LBB2_2: ; %A
	; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
	; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
	; GFX10-NEXT: s_mov_b32 s2, -1
	; GFX10-NEXT: v_lshlrev_b64 v[5:6], 2, v[4:5]
	; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v2, v5
	; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo
	; GFX10-NEXT: global_load_dword v7, v[7:8], off
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7
	; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
	; GFX10-NEXT: s_cbranch_execz .LBB2_1
	; GFX10-NEXT: ; %bb.3: ; %loop.body
	; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
	; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v0, v5
	; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo
	; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v4
	; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v4
	; GFX10-NEXT: global_load_dword v7, v[5:6], off
	; GFX10-NEXT: v_mov_b32_e32 v4, v8
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v7
	; GFX10-NEXT: global_store_dword v[5:6], v7, off
	; GFX10-NEXT: s_branch .LBB2_1
	; GFX10-NEXT: .LBB2_4: ; %exit
	; GFX10-NEXT: s_endpgm
	entry:
	br label %A

	A:
	; %counter is 0 on entry and %counter.plus.1 when coming back from %loop.body.
	%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
	%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
	%a.val = load i32, ptr addrspace(1) %a.plus.counter
	; Break: leave the loop when a[counter] == 0.
	%a.cond = icmp eq i32 %a.val, 0
	br i1 %a.cond, label %exit, label %loop.body

	loop.body:
	; x[counter] += 1
	%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
	%x.val = load i32, ptr addrspace(1) %x.plus.counter
	%x.val.plus.1 = add i32 %x.val, 1
	store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
	%counter.plus.1 = add i32 %counter, 1
	; The compare uses the pre-increment %counter; a true result goes to %exit.
	%x.cond = icmp ult i32 %counter, 100
	br i1 %x.cond, label %exit, label %A

	exit:
	ret void
	}

	; Loop with two early exits. Starting at %counter = 0, each iteration leaves
	; the loop when a[counter] == 0 (checked in %A) or when b[counter] == 0
	; (checked in %B); otherwise it adds 1 to x[counter] and advances the counter.
	; NOTE(review): the latch in %loop.body branches to %exit when
	; %counter < 100 (unsigned, pre-increment value) and back to %A otherwise,
	; which is the opposite of the 'while (counter<100)' pseudo-code earlier in
	; this file - confirm the intent before changing it.
	; NOTE(review): @loop_with_2breaks is also defined earlier in this file with
	; the same body; this second copy is presumably an extraction artifact -
	; confirm which definition should be kept.
	define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
	; GFX10-LABEL: loop_with_2breaks:
	; GFX10: ; %bb.0: ; %entry
	; GFX10-NEXT: s_mov_b32 s0, 0
	; GFX10-NEXT: v_mov_b32_e32 v6, s0
	; GFX10-NEXT: s_branch .LBB3_3
	; GFX10-NEXT: .LBB3_1: ; %Flow3
	; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
	; GFX10-NEXT: s_waitcnt_depctr 0xffe3
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
	; GFX10-NEXT: .LBB3_2: ; %Flow
	; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
	; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
	; GFX10-NEXT: s_or_b32 s0, s1, s0
	; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
	; GFX10-NEXT: s_cbranch_execz .LBB3_6
	; GFX10-NEXT: .LBB3_3: ; %A
	; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
	; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
	; GFX10-NEXT: s_mov_b32 s2, -1
	; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
	; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
	; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
	; GFX10-NEXT: global_load_dword v9, v[9:10], off
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
	; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
	; GFX10-NEXT: s_cbranch_execz .LBB3_2
	; GFX10-NEXT: ; %bb.4: ; %B
	; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
	; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7
	; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
	; GFX10-NEXT: global_load_dword v9, v[9:10], off
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
	; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo
	; GFX10-NEXT: s_cbranch_execz .LBB3_1
	; GFX10-NEXT: ; %bb.5: ; %loop.body
	; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
	; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
	; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
	; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
	; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6
	; GFX10-NEXT: global_load_dword v9, v[7:8], off
	; GFX10-NEXT: v_mov_b32_e32 v6, v10
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
	; GFX10-NEXT: global_store_dword v[7:8], v9, off
	; GFX10-NEXT: s_branch .LBB3_1
	; GFX10-NEXT: .LBB3_6: ; %exit
	; GFX10-NEXT: s_endpgm
	entry:
	br label %A

	A:
	; %counter is 0 on entry and %counter.plus.1 when coming back from %loop.body.
	%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
	%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
	%a.val = load i32, ptr addrspace(1) %a.plus.counter
	; First break: leave the loop when a[counter] == 0.
	%a.cond = icmp eq i32 %a.val, 0
	br i1 %a.cond, label %exit, label %B

	B:
	%b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
	%b.val = load i32, ptr addrspace(1) %b.plus.counter
	; Second break: leave the loop when b[counter] == 0.
	%b.cond = icmp eq i32 %b.val, 0
	br i1 %b.cond, label %exit, label %loop.body

	loop.body:
	; x[counter] += 1
	%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
	%x.val = load i32, ptr addrspace(1) %x.plus.counter
	%x.val.plus.1 = add i32 %x.val, 1
	store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
	%counter.plus.1 = add i32 %counter, 1
	; The compare uses the pre-increment %counter; a true result goes to %exit.
	%x.cond = icmp ult i32 %counter, 100
	br i1 %x.cond, label %exit, label %A

	exit:
	ret void
	}

	; Loop with three early exits. Starting at %counter = 0, each iteration leaves
	; the loop when a[counter] == 0 (%A), b[counter] == 0 (%B) or
	; c[counter] == 0 (%C); otherwise it adds 1 to x[counter] and advances the
	; counter.
	; NOTE(review): the latch in %loop.body branches to %exit when
	; %counter < 100 (unsigned, pre-increment value) and back to %A otherwise,
	; which is the opposite of the 'while (counter<100)' pseudo-code earlier in
	; this file - confirm the intent before changing it.
	; NOTE(review): @loop_with_3breaks is also defined earlier in this file with
	; the same body; this second copy is presumably an extraction artifact -
	; confirm which definition should be kept.
	define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
	; GFX10-LABEL: loop_with_3breaks:
	; GFX10: ; %bb.0: ; %entry
	; GFX10-NEXT: s_mov_b32 s0, 0
	; GFX10-NEXT: v_mov_b32_e32 v8, s0
	; GFX10-NEXT: s_branch .LBB4_4
	; GFX10-NEXT: .LBB4_1: ; %Flow5
	; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
	; GFX10-NEXT: s_waitcnt_depctr 0xffe3
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
	; GFX10-NEXT: .LBB4_2: ; %Flow4
	; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
	; GFX10-NEXT: .LBB4_3: ; %Flow
	; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
	; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
	; GFX10-NEXT: s_or_b32 s0, s1, s0
	; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
	; GFX10-NEXT: s_cbranch_execz .LBB4_8
	; GFX10-NEXT: .LBB4_4: ; %A
	; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
	; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v8
	; GFX10-NEXT: s_mov_b32 s2, -1
	; GFX10-NEXT: v_lshlrev_b64 v[9:10], 2, v[8:9]
	; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v2, v9
	; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo
	; GFX10-NEXT: global_load_dword v11, v[11:12], off
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
	; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
	; GFX10-NEXT: s_cbranch_execz .LBB4_3
	; GFX10-NEXT: ; %bb.5: ; %B
	; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
	; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v4, v9
	; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
	; GFX10-NEXT: global_load_dword v11, v[11:12], off
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
	; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo
	; GFX10-NEXT: s_cbranch_execz .LBB4_2
	; GFX10-NEXT: ; %bb.6: ; %C
	; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
	; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v9
	; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
	; GFX10-NEXT: global_load_dword v11, v[11:12], off
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
	; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
	; GFX10-NEXT: s_cbranch_execz .LBB4_1
	; GFX10-NEXT: ; %bb.7: ; %loop.body
	; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
	; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v0, v9
	; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo
	; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v8
	; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v8
	; GFX10-NEXT: global_load_dword v11, v[9:10], off
	; GFX10-NEXT: v_mov_b32_e32 v8, v12
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v11
	; GFX10-NEXT: global_store_dword v[9:10], v11, off
	; GFX10-NEXT: s_branch .LBB4_1
	; GFX10-NEXT: .LBB4_8: ; %exit
	; GFX10-NEXT: s_endpgm
	entry:
	br label %A

	A:
	; %counter is 0 on entry and %counter.plus.1 when coming back from %loop.body.
	%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
	%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
	%a.val = load i32, ptr addrspace(1) %a.plus.counter
	; First break: leave the loop when a[counter] == 0.
	%a.cond = icmp eq i32 %a.val, 0
	br i1 %a.cond, label %exit, label %B

	B:
	%b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
	%b.val = load i32, ptr addrspace(1) %b.plus.counter
	; Second break: leave the loop when b[counter] == 0.
	%b.cond = icmp eq i32 %b.val, 0
	br i1 %b.cond, label %exit, label %C

	C:
	%c.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %c, i32 %counter
	%c.val = load i32, ptr addrspace(1) %c.plus.counter
	; Third break: leave the loop when c[counter] == 0.
	%c.cond = icmp eq i32 %c.val, 0
	br i1 %c.cond, label %exit, label %loop.body

	loop.body:
	; x[counter] += 1
	%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
	%x.val = load i32, ptr addrspace(1) %x.plus.counter
	%x.val.plus.1 = add i32 %x.val, 1
	store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
	%counter.plus.1 = add i32 %counter, 1
	; The compare uses the pre-increment %counter; a true result goes to %exit.
	%x.cond = icmp ult i32 %counter, 100
	br i1 %x.cond, label %exit, label %A

	exit:
	ret void
	}

	; An 'if' with a divergent condition and a body that ends with a break. This
	; is a loop with two exits, but the structurizer will create a phi that tracks
	; the exit taken via the break and will move break.body after the loop. The
	; loop will then have one exit, and the phi will be used outside of the loop by
	; the condition that guards entry to break.body.
	; Loop whose early exit runs extra code. Starting at %counter = 0, each
	; iteration loads a[counter]; when it is 0 control goes to %break.body, which
	; stores 10 to %a.break and then reaches %exit. Otherwise %loop.body adds 1 to
	; x[counter], advances the counter, and either exits or loops back to %A.
	; NOTE(review): the latch in %loop.body branches to %exit when
	; %counter < 100 (unsigned, pre-increment value) and back to %A otherwise,
	; which is the opposite of the 'while (counter<100)' pseudo-code earlier in
	; this file - confirm the intent before changing it.
	; NOTE(review): @loop_with_div_break_with_body is also defined earlier in
	; this file with the same body; this second copy is presumably an extraction
	; artifact - confirm which definition should be kept.
	define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
	; GFX10-LABEL: loop_with_div_break_with_body:
	; GFX10: ; %bb.0: ; %entry
	; GFX10-NEXT: s_mov_b32 s0, 0
	; GFX10-NEXT: v_mov_b32_e32 v6, s0
	; GFX10-NEXT: s_branch .LBB5_2
	; GFX10-NEXT: .LBB5_1: ; %Flow
	; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
	; GFX10-NEXT: s_waitcnt_depctr 0xffe3
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
	; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
	; GFX10-NEXT: s_or_b32 s0, s1, s0
	; GFX10-NEXT: s_and_b32 s1, 1, s3
	; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
	; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
	; GFX10-NEXT: s_cbranch_execz .LBB5_4
	; GFX10-NEXT: .LBB5_2: ; %A
	; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
	; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
	; GFX10-NEXT: s_mov_b32 s2, -1
	; GFX10-NEXT: s_mov_b32 s3, 1
	; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
	; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
	; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
	; GFX10-NEXT: global_load_dword v9, v[9:10], off
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
	; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
	; GFX10-NEXT: s_cbranch_execz .LBB5_1
	; GFX10-NEXT: ; %bb.3: ; %loop.body
	; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
	; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
	; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
	; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
	; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6
	; GFX10-NEXT: s_mov_b32 s3, 0
	; GFX10-NEXT: global_load_dword v9, v[7:8], off
	; GFX10-NEXT: v_mov_b32_e32 v6, v10
	; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
	; GFX10-NEXT: global_store_dword v[7:8], v9, off
	; GFX10-NEXT: s_branch .LBB5_1
	; GFX10-NEXT: .LBB5_4: ; %loop.exit.guard
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
	; GFX10-NEXT: s_and_saveexec_b32 s0, s1
	; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
	; GFX10-NEXT: s_cbranch_execz .LBB5_6
	; GFX10-NEXT: ; %bb.5: ; %break.body
	; GFX10-NEXT: v_mov_b32_e32 v0, 10
	; GFX10-NEXT: global_store_dword v[4:5], v0, off
	; GFX10-NEXT: .LBB5_6: ; %exit
	; GFX10-NEXT: s_endpgm
	entry:
	br label %A

	A:
	; %counter is 0 on entry and %counter.plus.1 when coming back from %loop.body.
	%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
	%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
	%a.val = load i32, ptr addrspace(1) %a.plus.counter
	; Break with a body: go to %break.body when a[counter] == 0.
	%a.cond = icmp eq i32 %a.val, 0
	br i1 %a.cond, label %break.body, label %loop.body

	break.body:
	; Reached only from %A's break edge: store the constant 10 to %a.break.
	store i32 10, ptr addrspace(1) %a.break
	br label %exit


	loop.body:
	; x[counter] += 1
	%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
	%x.val = load i32, ptr addrspace(1) %x.plus.counter
	%x.val.plus.1 = add i32 %x.val, 1
	store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
	%counter.plus.1 = add i32 %counter, 1
	; The compare uses the pre-increment %counter; a true result goes to %exit.
	%x.cond = icmp ult i32 %counter, 100
	br i1 %x.cond, label %exit, label %A

	exit:
	ret void
	}

	; Snippet from test generated by the GraphicsFuzz tool, frontend generates ir
	; with irreducible control flow graph. FixIrreducible converts it into natural
	; loop and in the process creates i1 phi with three incoming values.

	; int loop(int x, int y, int a0, int a1, int a2, int a3, int a4) {
	; do {
	; if (y < a2) {
	; do {
	; } while (x < a2);
	; }
	; if (x < a3) {
	; return a1;
	; }
	; } while (y < a2);
	; return a0;
	; }

	; This test is also interesting because it has phi with three incomings
	;define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
	;.entry:
	; %.y_lt_a2 = icmp sgt i32 %a2, %y
	; %.x_lt_a2 = icmp sgt i32 %a2, %x
	; %.x_lt_a3 = icmp sgt i32 %a3, %x
	; br i1 %.y_lt_a2, label %.preheader, label %.loopexit ; first iteration, jump to inner loop if 'y < a2' or start with 'if (x < a3)'
	;
	;.preheader: ; if (y < a2),
	; br label %.inner_loop
	;
	;.inner_loop: ; do while x < a2
	; br i1 %.x_lt_a2, label %.inner_loop, label %.loopexit
	;
	;.loopexit: ; if x < a3
	; %not.inner_loop = xor i1 %.y_lt_a2, true
	; %brmerge = select i1 %.x_lt_a3, i1 true, i1 %not.inner_loop ; exit loop if 'x < a3' or 'loop ends since !(y < a2)'
	; %.ret = select i1 %.x_lt_a3, i32 %a1, i32 %a0 ; select return value a1 'x < a3' or a0 'loop ends'
	; br i1 %brmerge, label %.exit, label %.preheader
	;
	;.exit:
	; ret i32 %.ret
	;}