|  | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | 
|  | ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT | 
|  | ; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX | 
|  |  | 
|  | ; Aggregate of four i32 fields; used as the byval type of the first argument | 
|  | ; of @non_kernel_function. | 
|  | %struct.uint4 = type { i32, i32, i32, i32 } | 
|  |  | 
|  | ; Global in addrspace(1), 16-byte aligned. The four initializer words are | 
|  | ; 0x03020100, 0x07060504, 0x0B0A0908 and 0x0F0E0D0C. | 
|  | @gi = dso_local addrspace(1) externally_initialized global %struct.uint4 { i32 50462976, i32 117835012, i32 185207048, i32 252579084 }, align 16 | 
|  |  | 
|  | ; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) | 
|  | ; Regular functions must still make a copy. `cvta.param` does not always work there. | 
|  | ; | 
|  | ; Non-kernel function taking a byval %struct.uint4. It selects between %a and | 
|  | ; the generic address of @gi based on %b, then performs an align-1 i32 load at | 
|  | ; byte offset %c from the selected pointer. | 
|  | ; The OPT checks expect the body to be left as written by -nvptx-lower-args. | 
|  | ; The PTX checks expect the address of param_0 to be formed with mov.b64 + | 
|  | ; cvta.local (not cvta.param), and the align-1 load to be split into four | 
|  | ; ld.b8 loads recombined with shl/or. | 
|  | define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly byval(%struct.uint4) align 16 %a, i1 noundef zeroext %b, i32 noundef %c) local_unnamed_addr #0 { | 
|  | ; OPT-LABEL: define dso_local noundef i32 @non_kernel_function( | 
|  | ; OPT-SAME: ptr noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 captures(none) [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | 
|  | ; OPT-NEXT:  [[ENTRY:.*:]] | 
|  | ; OPT-NEXT:    [[A_:%.*]] = select i1 [[B]], ptr [[A]], ptr addrspacecast (ptr addrspace(1) @gi to ptr) | 
|  | ; OPT-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[C]] to i64 | 
|  | ; OPT-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]] | 
|  | ; OPT-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1 | 
|  | ; OPT-NEXT:    ret i32 [[TMP0]] | 
|  | ; | 
|  | ; PTX-LABEL: non_kernel_function( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .pred %p<2>; | 
|  | ; PTX-NEXT:    .reg .b16 %rs<3>; | 
|  | ; PTX-NEXT:    .reg .b32 %r<11>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<8>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: // %entry | 
|  | ; PTX-NEXT:    mov.b64 %rd1, non_kernel_function_param_0; | 
|  | ; PTX-NEXT:    cvta.local.u64 %rd2, %rd1; | 
|  | ; PTX-NEXT:    ld.param.b8 %rs1, [non_kernel_function_param_1]; | 
|  | ; PTX-NEXT:    and.b16 %rs2, %rs1, 1; | 
|  | ; PTX-NEXT:    setp.ne.b16 %p1, %rs2, 0; | 
|  | ; PTX-NEXT:    mov.b64 %rd3, gi; | 
|  | ; PTX-NEXT:    cvta.global.u64 %rd4, %rd3; | 
|  | ; PTX-NEXT:    selp.b64 %rd5, %rd2, %rd4, %p1; | 
|  | ; PTX-NEXT:    ld.param.s32 %rd6, [non_kernel_function_param_2]; | 
|  | ; PTX-NEXT:    add.s64 %rd7, %rd5, %rd6; | 
|  | ; PTX-NEXT:    ld.b8 %r1, [%rd7]; | 
|  | ; PTX-NEXT:    ld.b8 %r2, [%rd7+1]; | 
|  | ; PTX-NEXT:    shl.b32 %r3, %r2, 8; | 
|  | ; PTX-NEXT:    or.b32 %r4, %r3, %r1; | 
|  | ; PTX-NEXT:    ld.b8 %r5, [%rd7+2]; | 
|  | ; PTX-NEXT:    shl.b32 %r6, %r5, 16; | 
|  | ; PTX-NEXT:    ld.b8 %r7, [%rd7+3]; | 
|  | ; PTX-NEXT:    shl.b32 %r8, %r7, 24; | 
|  | ; PTX-NEXT:    or.b32 %r9, %r8, %r6; | 
|  | ; PTX-NEXT:    or.b32 %r10, %r9, %r4; | 
|  | ; PTX-NEXT:    st.param.b32 [func_retval0], %r10; | 
|  | ; PTX-NEXT:    ret; | 
|  | entry: | 
|  | %a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr) | 
|  | %idx.ext = sext i32 %c to i64 | 
|  | %add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext | 
|  | %0 = load i32, ptr %add.ptr, align 1 | 
|  | ret i32 %0 | 
|  | } | 
|  |  | 
|  | ; Kernel whose first argument is a byval i32 marked "nvvm.grid_constant". | 
|  | ; It loads the argument, adds %input2 and stores the sum to %out; %n is not | 
|  | ; used by the body. | 
|  | ; The OPT checks expect the load to go through the ptr addrspace(101) value | 
|  | ; returned by @llvm.nvvm.internal.addrspace.wrap.p101.p0. The PTX checks | 
|  | ; expect param_0 to be read directly with ld.param.b32. | 
|  | define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 "nvvm.grid_constant" %input1, i32 %input2, ptr %out, i32 %n) { | 
|  | ; PTX-LABEL: grid_const_int( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .b32 %r<4>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<3>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    ld.param.b64 %rd1, [grid_const_int_param_2]; | 
|  | ; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1; | 
|  | ; PTX-NEXT:    ld.param.b32 %r1, [grid_const_int_param_1]; | 
|  | ; PTX-NEXT:    ld.param.b32 %r2, [grid_const_int_param_0]; | 
|  | ; PTX-NEXT:    add.s32 %r3, %r2, %r1; | 
|  | ; PTX-NEXT:    st.global.b32 [%rd2], %r3; | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel void @grid_const_int( | 
|  | ; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[INPUT11:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) | 
|  | ; OPT-NEXT:    [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 | 
|  | ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]] | 
|  | ; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT]], align 4 | 
|  | ; OPT-NEXT:    ret void | 
|  | %tmp = load i32, ptr %input1, align 4 | 
|  | %add = add i32 %tmp, %input2 | 
|  | store i32 %add, ptr %out | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Two-field i32 aggregate used as the byval type by the struct-based kernels in this file. | 
|  | %struct.s = type { i32, i32 } | 
|  |  | 
|  | ; Kernel with a byval %struct.s argument marked "nvvm.grid_constant". | 
|  | ; It loads both fields through GEPs, adds them and stores the sum to %out. | 
|  | ; The OPT checks expect both GEPs and both loads to be rewritten onto the | 
|  | ; ptr addrspace(101) value returned by the addrspace.wrap intrinsic. The PTX | 
|  | ; checks expect two ld.param.b32 loads at param_0 and param_0+4. | 
|  | define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, ptr %out){ | 
|  | ; PTX-LABEL: grid_const_struct( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .b32 %r<4>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<3>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    ld.param.b64 %rd1, [grid_const_struct_param_1]; | 
|  | ; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1; | 
|  | ; PTX-NEXT:    ld.param.b32 %r1, [grid_const_struct_param_0]; | 
|  | ; PTX-NEXT:    ld.param.b32 %r2, [grid_const_struct_param_0+4]; | 
|  | ; PTX-NEXT:    add.s32 %r3, %r1, %r2; | 
|  | ; PTX-NEXT:    st.global.b32 [%rd2], %r3; | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel void @grid_const_struct( | 
|  | ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[INPUT1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) | 
|  | ; OPT-NEXT:    [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 | 
|  | ; OPT-NEXT:    [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 | 
|  | ; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4 | 
|  | ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]] | 
|  | ; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT]], align 4 | 
|  | ; OPT-NEXT:    ret void | 
|  | %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 | 
|  | %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 | 
|  | %int1 = load i32, ptr %gep1 | 
|  | %int2 = load i32, ptr %gep2 | 
|  | %add = add i32 %int1, %int2 | 
|  | store i32 %add, ptr %out | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Kernel that passes its "nvvm.grid_constant" byval argument directly to | 
|  | ; @escape, so the pointer escapes through a call. | 
|  | ; The OPT checks expect the argument to be wrapped into addrspace(101) and | 
|  | ; then addrspacecast back to a generic pointer for the call. The PTX checks | 
|  | ; expect the generic address to be formed with mov.b64 + cvta.param.u64 and | 
|  | ; the call to go through a .callprototype with the callee address in %rd3. | 
|  | define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input) { | 
|  | ; PTX-LABEL: grid_const_escape( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .b64 %rd<4>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %rd1, grid_const_escape_param_0; | 
|  | ; PTX-NEXT:    cvta.param.u64 %rd2, %rd1; | 
|  | ; PTX-NEXT:    { // callseq 0, 0 | 
|  | ; PTX-NEXT:    .param .b64 param0; | 
|  | ; PTX-NEXT:    .param .b32 retval0; | 
|  | ; PTX-NEXT:    st.param.b64 [param0], %rd2; | 
|  | ; PTX-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _); | 
|  | ; PTX-NEXT:    mov.b64 %rd3, escape; | 
|  | ; PTX-NEXT:    call (retval0), %rd3, (param0), prototype_0; | 
|  | ; PTX-NEXT:    } // callseq 0 | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel void @grid_const_escape( | 
|  | ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) | 
|  | ; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr | 
|  | ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) | 
|  | ; OPT-NEXT:    ret void | 
|  | %call = call i32 @escape(ptr %input) | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Kernel with two "nvvm.grid_constant" byval arguments (%input and %b) and a | 
|  | ; plain i32 %a. %a is spilled to an alloca, and all three pointers are passed | 
|  | ; to @escape3. | 
|  | ; The OPT checks expect each grid_constant argument to be wrapped into | 
|  | ; addrspace(101) and cast back to a generic pointer, while the alloca is kept. | 
|  | ; The PTX checks expect cvta.param.u64 for both byval parameters and a 4-byte | 
|  | ; __local_depot4 that receives %a via st.local.b32. | 
|  | define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, i32 %a, ptr byval(i32) align 4 "nvvm.grid_constant" %b) { | 
|  | ; PTX-LABEL: multiple_grid_const_escape( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .local .align 4 .b8 __local_depot4[4]; | 
|  | ; PTX-NEXT:    .reg .b64 %SP; | 
|  | ; PTX-NEXT:    .reg .b64 %SPL; | 
|  | ; PTX-NEXT:    .reg .b32 %r<2>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<8>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %SPL, __local_depot4; | 
|  | ; PTX-NEXT:    cvta.local.u64 %SP, %SPL; | 
|  | ; PTX-NEXT:    mov.b64 %rd1, multiple_grid_const_escape_param_0; | 
|  | ; PTX-NEXT:    ld.param.b32 %r1, [multiple_grid_const_escape_param_1]; | 
|  | ; PTX-NEXT:    mov.b64 %rd2, multiple_grid_const_escape_param_2; | 
|  | ; PTX-NEXT:    cvta.param.u64 %rd3, %rd2; | 
|  | ; PTX-NEXT:    cvta.param.u64 %rd4, %rd1; | 
|  | ; PTX-NEXT:    add.u64 %rd5, %SP, 0; | 
|  | ; PTX-NEXT:    add.u64 %rd6, %SPL, 0; | 
|  | ; PTX-NEXT:    st.local.b32 [%rd6], %r1; | 
|  | ; PTX-NEXT:    { // callseq 1, 0 | 
|  | ; PTX-NEXT:    .param .b64 param0; | 
|  | ; PTX-NEXT:    .param .b64 param1; | 
|  | ; PTX-NEXT:    .param .b64 param2; | 
|  | ; PTX-NEXT:    .param .b32 retval0; | 
|  | ; PTX-NEXT:    st.param.b64 [param2], %rd3; | 
|  | ; PTX-NEXT:    st.param.b64 [param1], %rd5; | 
|  | ; PTX-NEXT:    st.param.b64 [param0], %rd4; | 
|  | ; PTX-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _); | 
|  | ; PTX-NEXT:    mov.b64 %rd7, escape3; | 
|  | ; PTX-NEXT:    call (retval0), %rd7, (param0, param1, param2), prototype_1; | 
|  | ; PTX-NEXT:    } // callseq 1 | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape( | 
|  | ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 "nvvm.grid_constant" [[B:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]]) | 
|  | ; OPT-NEXT:    [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr | 
|  | ; OPT-NEXT:    [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) | 
|  | ; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr | 
|  | ; OPT-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4 | 
|  | ; OPT-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4 | 
|  | ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]]) | 
|  | ; OPT-NEXT:    ret void | 
|  | %a.addr = alloca i32, align 4 | 
|  | store i32 %a, ptr %a.addr, align 4 | 
|  | %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b) | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Kernel that stores the address of its "nvvm.grid_constant" byval argument | 
|  | ; to %addr, so the pointer escapes through memory. | 
|  | ; The OPT checks expect the stored value to be the addrspace(101) wrapper | 
|  | ; cast back to a generic pointer. The PTX checks expect cvta.param.u64 | 
|  | ; followed by st.global.b64 of the converted address. | 
|  | define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, ptr %addr) { | 
|  | ; PTX-LABEL: grid_const_memory_escape( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .b64 %rd<5>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %rd1, grid_const_memory_escape_param_0; | 
|  | ; PTX-NEXT:    ld.param.b64 %rd2, [grid_const_memory_escape_param_1]; | 
|  | ; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2; | 
|  | ; PTX-NEXT:    cvta.param.u64 %rd4, %rd1; | 
|  | ; PTX-NEXT:    st.global.b64 [%rd3], %rd4; | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape( | 
|  | ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) | 
|  | ; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr | 
|  | ; OPT-NEXT:    store ptr [[INPUT1]], ptr [[ADDR]], align 8 | 
|  | ; OPT-NEXT:    ret void | 
|  | store ptr %input, ptr %addr, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Kernel that passes pointers to both fields of its "nvvm.grid_constant" | 
|  | ; byval argument as operands of an inline-asm add.s64 and stores the asm | 
|  | ; result to %result. | 
|  | ; The OPT checks expect the GEPs to be rebased onto the addrspace(101) | 
|  | ; wrapper cast back to a generic pointer. The PTX checks expect | 
|  | ; cvta.param.u64 for the base address and an add.s64 of 4 for the second | 
|  | ; field. | 
|  | define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, ptr %result) { | 
|  | ; PTX-LABEL: grid_const_inlineasm_escape( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .b64 %rd<7>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %rd4, grid_const_inlineasm_escape_param_0; | 
|  | ; PTX-NEXT:    ld.param.b64 %rd5, [grid_const_inlineasm_escape_param_1]; | 
|  | ; PTX-NEXT:    cvta.to.global.u64 %rd6, %rd5; | 
|  | ; PTX-NEXT:    cvta.param.u64 %rd2, %rd4; | 
|  | ; PTX-NEXT:    add.s64 %rd3, %rd2, 4; | 
|  | ; PTX-NEXT:    // begin inline asm | 
|  | ; PTX-NEXT:    add.s64 %rd1, %rd2, %rd3; | 
|  | ; PTX-NEXT:    // end inline asm | 
|  | ; PTX-NEXT:    st.global.b64 [%rd6], %rd1; | 
|  | ; PTX-NEXT:    ret; | 
|  | ; The directive below needs its trailing colon: without it FileCheck treats | 
|  | ; the line as a plain comment and the check is silently skipped. | 
|  | ; PTX-NOT:     .local | 
|  | ; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape( | 
|  | ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) | 
|  | ; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr | 
|  | ; OPT-NEXT:    [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 | 
|  | ; OPT-NEXT:    [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 | 
|  | ; OPT-NEXT:    [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 | 
|  | ; OPT-NEXT:    store i64 [[TMP2]], ptr [[RESULT]], align 8 | 
|  | ; OPT-NEXT:    ret void | 
|  | %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 | 
|  | %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 | 
|  | %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1 | 
|  | store i64 %1, ptr %result, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Kernel that both loads from its "nvvm.grid_constant" byval i32 argument | 
|  | ; (storing twice the value to %output) and passes the same pointer to | 
|  | ; @escape. The byval attribute carries no explicit alignment here. | 
|  | ; The OPT checks expect the load and the call to both use the addrspace(101) | 
|  | ; wrapper cast back to a generic pointer. The PTX checks expect the value to | 
|  | ; be read with ld.param.b32 from param_0 and the call argument to be the | 
|  | ; cvta.param.u64 result. | 
|  | define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) "nvvm.grid_constant" %input, ptr %output) { | 
|  | ; PTX-LABEL: grid_const_partial_escape( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .b32 %r<3>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<6>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %rd1, grid_const_partial_escape_param_0; | 
|  | ; PTX-NEXT:    ld.param.b64 %rd2, [grid_const_partial_escape_param_1]; | 
|  | ; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2; | 
|  | ; PTX-NEXT:    cvta.param.u64 %rd4, %rd1; | 
|  | ; PTX-NEXT:    ld.param.b32 %r1, [grid_const_partial_escape_param_0]; | 
|  | ; PTX-NEXT:    add.s32 %r2, %r1, %r1; | 
|  | ; PTX-NEXT:    st.global.b32 [%rd3], %r2; | 
|  | ; PTX-NEXT:    { // callseq 2, 0 | 
|  | ; PTX-NEXT:    .param .b64 param0; | 
|  | ; PTX-NEXT:    .param .b32 retval0; | 
|  | ; PTX-NEXT:    st.param.b64 [param0], %rd4; | 
|  | ; PTX-NEXT:    prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _); | 
|  | ; PTX-NEXT:    mov.b64 %rd5, escape; | 
|  | ; PTX-NEXT:    call (retval0), %rd5, (param0), prototype_2; | 
|  | ; PTX-NEXT:    } // callseq 2 | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape( | 
|  | ; OPT-SAME: ptr byval(i32) "nvvm.grid_constant" [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) | 
|  | ; OPT-NEXT:    [[INPUT1_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr | 
|  | ; OPT-NEXT:    [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4 | 
|  | ; OPT-NEXT:    [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]] | 
|  | ; OPT-NEXT:    store i32 [[TWICE]], ptr [[OUTPUT]], align 4 | 
|  | ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) | 
|  | ; OPT-NEXT:    ret void | 
|  | %val = load i32, ptr %input | 
|  | %twice = add i32 %val, %val | 
|  | store i32 %twice, ptr %output | 
|  | %call = call i32 @escape(ptr %input) | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Kernel that loads both fields of its "nvvm.grid_constant" byval struct, | 
|  | ; stores the argument's address to %output, passes the field-0 pointer to | 
|  | ; @escape, and returns the sum of the two fields. | 
|  | ; The OPT checks expect all uses to be rebased onto the addrspace(101) | 
|  | ; wrapper cast back to a generic pointer. The PTX checks expect two | 
|  | ; ld.param.b32 loads (param_0 and param_0+4) and the cvta.param.u64 result | 
|  | ; to be used for both the st.global.b64 and the call argument. | 
|  | define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) "nvvm.grid_constant" %input, ptr %output) { | 
|  | ; PTX-LABEL: grid_const_partial_escapemem( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .b32 %r<4>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<6>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %rd1, grid_const_partial_escapemem_param_0; | 
|  | ; PTX-NEXT:    ld.param.b64 %rd2, [grid_const_partial_escapemem_param_1]; | 
|  | ; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2; | 
|  | ; PTX-NEXT:    cvta.param.u64 %rd4, %rd1; | 
|  | ; PTX-NEXT:    ld.param.b32 %r1, [grid_const_partial_escapemem_param_0]; | 
|  | ; PTX-NEXT:    ld.param.b32 %r2, [grid_const_partial_escapemem_param_0+4]; | 
|  | ; PTX-NEXT:    st.global.b64 [%rd3], %rd4; | 
|  | ; PTX-NEXT:    add.s32 %r3, %r1, %r2; | 
|  | ; PTX-NEXT:    { // callseq 3, 0 | 
|  | ; PTX-NEXT:    .param .b64 param0; | 
|  | ; PTX-NEXT:    .param .b32 retval0; | 
|  | ; PTX-NEXT:    st.param.b64 [param0], %rd4; | 
|  | ; PTX-NEXT:    prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _); | 
|  | ; PTX-NEXT:    mov.b64 %rd5, escape; | 
|  | ; PTX-NEXT:    call (retval0), %rd5, (param0), prototype_3; | 
|  | ; PTX-NEXT:    } // callseq 3 | 
|  | ; PTX-NEXT:    st.param.b32 [func_retval0], %r3; | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem( | 
|  | ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) "nvvm.grid_constant" [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) | 
|  | ; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr | 
|  | ; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 | 
|  | ; OPT-NEXT:    [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4 | 
|  | ; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 | 
|  | ; OPT-NEXT:    [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4 | 
|  | ; OPT-NEXT:    store ptr [[INPUT1]], ptr [[OUTPUT]], align 8 | 
|  | ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] | 
|  | ; OPT-NEXT:    [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) | 
|  | ; OPT-NEXT:    ret i32 [[ADD]] | 
|  | %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 | 
|  | %val1 = load i32, ptr %ptr1 | 
|  | %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 | 
|  | %val2 = load i32, ptr %ptr2 | 
|  | store ptr %input, ptr %output | 
|  | %add = add i32 %val1, %val2 | 
|  | %call2 = call i32 @escape(ptr %ptr1) | 
|  | ret i32 %add | 
|  | } | 
|  |  | 
|  | ; Kernel that picks a pointer to field 0 or field 1 of its | 
|  | ; "nvvm.grid_constant" byval struct through a phi (branching on whether the | 
|  | ; value loaded from %inout is negative), loads through that pointer and | 
|  | ; stores the result back to %inout. | 
|  | ; The OPT checks expect both GEPs to be rebased onto the addrspace(101) | 
|  | ; wrapper cast back to a generic pointer. The PTX checks expect the final | 
|  | ; load to be an ld.param.b32 through register %rd3. | 
|  | define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input1, ptr %inout) { | 
|  | ; PTX-LABEL: grid_const_phi( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .pred %p<2>; | 
|  | ; PTX-NEXT:    .reg .b32 %r<3>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<4>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %rd3, grid_const_phi_param_0; | 
|  | ; PTX-NEXT:    ld.param.b64 %rd2, [grid_const_phi_param_1]; | 
|  | ; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd2; | 
|  | ; PTX-NEXT:    ld.global.b32 %r1, [%rd1]; | 
|  | ; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0; | 
|  | ; PTX-NEXT:    @%p1 bra $L__BB9_2; | 
|  | ; PTX-NEXT:  // %bb.1: // %second | 
|  | ; PTX-NEXT:    add.s64 %rd3, %rd3, 4; | 
|  | ; PTX-NEXT:  $L__BB9_2: // %merge | 
|  | ; PTX-NEXT:    ld.param.b32 %r2, [%rd3]; | 
|  | ; PTX-NEXT:    st.global.b32 [%rd1], %r2; | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel void @grid_const_phi( | 
|  | ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) | 
|  | ; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr | 
|  | ; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 | 
|  | ; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 | 
|  | ; OPT-NEXT:    br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] | 
|  | ; OPT:       [[FIRST]]: | 
|  | ; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 | 
|  | ; OPT-NEXT:    br label %[[MERGE:.*]] | 
|  | ; OPT:       [[SECOND]]: | 
|  | ; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1 | 
|  | ; OPT-NEXT:    br label %[[MERGE]] | 
|  | ; OPT:       [[MERGE]]: | 
|  | ; OPT-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] | 
|  | ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 | 
|  | ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT]], align 4 | 
|  | ; OPT-NEXT:    ret void | 
|  |  | 
|  | %val = load i32, ptr %inout | 
|  | %less = icmp slt i32 %val, 0 | 
|  | br i1 %less, label %first, label %second | 
|  | first: | 
|  | %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0 | 
|  | br label %merge | 
|  | second: | 
|  | %ptr2 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 1 | 
|  | br label %merge | 
|  | merge: | 
|  | %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second] | 
|  | %valloaded = load i32, ptr %ptrnew | 
|  | store i32 %valloaded, ptr %inout | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; NOTE: %input2 is *not* grid_constant | 
|  | ; | 
|  | ; Same shape as @grid_const_phi, but the phi merges a pointer into the | 
|  | ; "nvvm.grid_constant" argument %input1 (field 0) with a pointer into the | 
|  | ; plain byval argument %input2 (field 1). | 
|  | ; The OPT checks expect both arguments to be wrapped into addrspace(101) and | 
|  | ; cast back to generic pointers (only %input1's wrapper carries align 4). | 
|  | ; The PTX checks expect the final load to be an ld.param.b32 through %rd4. | 
|  | define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input1, ptr byval(%struct.s) %input2, ptr %inout) { | 
|  | ; PTX-LABEL: grid_const_phi_ngc( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .pred %p<2>; | 
|  | ; PTX-NEXT:    .reg .b32 %r<3>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<5>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %rd4, grid_const_phi_ngc_param_0; | 
|  | ; PTX-NEXT:    ld.param.b64 %rd3, [grid_const_phi_ngc_param_2]; | 
|  | ; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd3; | 
|  | ; PTX-NEXT:    ld.global.b32 %r1, [%rd1]; | 
|  | ; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0; | 
|  | ; PTX-NEXT:    @%p1 bra $L__BB10_2; | 
|  | ; PTX-NEXT:  // %bb.1: // %second | 
|  | ; PTX-NEXT:    mov.b64 %rd2, grid_const_phi_ngc_param_1; | 
|  | ; PTX-NEXT:    add.s64 %rd4, %rd2, 4; | 
|  | ; PTX-NEXT:  $L__BB10_2: // %merge | 
|  | ; PTX-NEXT:    ld.param.b32 %r2, [%rd4]; | 
|  | ; PTX-NEXT:    st.global.b32 [%rd1], %r2; | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc( | 
|  | ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) | 
|  | ; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr | 
|  | ; OPT-NEXT:    [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) | 
|  | ; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr | 
|  | ; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 | 
|  | ; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 | 
|  | ; OPT-NEXT:    br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] | 
|  | ; OPT:       [[FIRST]]: | 
|  | ; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 | 
|  | ; OPT-NEXT:    br label %[[MERGE:.*]] | 
|  | ; OPT:       [[SECOND]]: | 
|  | ; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1 | 
|  | ; OPT-NEXT:    br label %[[MERGE]] | 
|  | ; OPT:       [[MERGE]]: | 
|  | ; OPT-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] | 
|  | ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 | 
|  | ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT]], align 4 | 
|  | ; OPT-NEXT:    ret void | 
|  | %val = load i32, ptr %inout | 
|  | %less = icmp slt i32 %val, 0 | 
|  | br i1 %less, label %first, label %second | 
|  | first: | 
|  | %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0 | 
|  | br label %merge | 
|  | second: | 
|  | %ptr2 = getelementptr inbounds %struct.s, ptr %input2, i32 0, i32 1 | 
|  | br label %merge | 
|  | merge: | 
|  | %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second] | 
|  | %valloaded = load i32, ptr %ptrnew | 
|  | store i32 %valloaded, ptr %inout | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; NOTE: %input2 is *not* grid_constant | 
|  | ; | 
|  | ; Kernel that uses a select (on whether the value loaded from %inout is | 
|  | ; negative) to choose between the "nvvm.grid_constant" argument %input1 and | 
|  | ; the plain byval argument %input2, loads through the chosen pointer and | 
|  | ; stores the result back to %inout. | 
|  | ; The OPT checks expect both arguments to be wrapped into addrspace(101) and | 
|  | ; cast back to generic pointers before the select. The PTX checks expect a | 
|  | ; selp.b64 between the two parameter addresses followed by ld.param.b32. | 
|  | define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 "nvvm.grid_constant" %input1, ptr byval(i32) %input2, ptr %inout) { | 
|  | ; PTX-LABEL: grid_const_select( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .pred %p<2>; | 
|  | ; PTX-NEXT:    .reg .b32 %r<3>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<6>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %rd1, grid_const_select_param_0; | 
|  | ; PTX-NEXT:    ld.param.b64 %rd2, [grid_const_select_param_2]; | 
|  | ; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2; | 
|  | ; PTX-NEXT:    mov.b64 %rd4, grid_const_select_param_1; | 
|  | ; PTX-NEXT:    ld.global.b32 %r1, [%rd3]; | 
|  | ; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0; | 
|  | ; PTX-NEXT:    selp.b64 %rd5, %rd1, %rd4, %p1; | 
|  | ; PTX-NEXT:    ld.param.b32 %r2, [%rd5]; | 
|  | ; PTX-NEXT:    st.global.b32 [%rd3], %r2; | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel void @grid_const_select( | 
|  | ; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) | 
|  | ; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr | 
|  | ; OPT-NEXT:    [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) | 
|  | ; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr | 
|  | ; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 | 
|  | ; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 | 
|  | ; OPT-NEXT:    [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] | 
|  | ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 | 
|  | ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT]], align 4 | 
|  | ; OPT-NEXT:    ret void | 
|  | %val = load i32, ptr %inout | 
|  | %less = icmp slt i32 %val, 0 | 
|  | %ptrnew = select i1 %less, ptr %input1, ptr %input2 | 
|  | %valloaded = load i32, ptr %ptrnew | 
|  | store i32 %valloaded, ptr %inout | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Kernel that both loads from its "nvvm.grid_constant" byval i32 argument | 
|  | ; and converts the argument's address to i32 with ptrtoint, returning the | 
|  | ; sum of the two. | 
|  | ; The OPT checks expect the load to read addrspace(101) directly while the | 
|  | ; ptrtoint uses the wrapper cast back to a generic pointer. The PTX checks | 
|  | ; expect ld.param.b32 for the value and cvta.param.u64 + cvt.u32.u64 for | 
|  | ; the address. | 
|  | ; NOTE(review): OPT-SAME expects `align 4` on the argument although the | 
|  | ; define below has no explicit alignment; presumably the pass adds it. | 
|  | ; Confirm when regenerating the checks. | 
|  | define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) "nvvm.grid_constant" %input) { | 
|  | ; PTX-LABEL: grid_const_ptrtoint( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .b32 %r<4>; | 
|  | ; PTX-NEXT:    .reg .b64 %rd<3>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    mov.b64 %rd1, grid_const_ptrtoint_param_0; | 
|  | ; PTX-NEXT:    ld.param.b32 %r1, [grid_const_ptrtoint_param_0]; | 
|  | ; PTX-NEXT:    cvta.param.u64 %rd2, %rd1; | 
|  | ; PTX-NEXT:    cvt.u32.u64 %r2, %rd2; | 
|  | ; PTX-NEXT:    add.s32 %r3, %r1, %r2; | 
|  | ; PTX-NEXT:    st.param.b32 [func_retval0], %r3; | 
|  | ; PTX-NEXT:    ret; | 
|  | ; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint( | 
|  | ; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) | 
|  | ; OPT-NEXT:    [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4 | 
|  | ; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[INPUT2]] to ptr | 
|  | ; OPT-NEXT:    [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32 | 
|  | ; OPT-NEXT:    [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]] | 
|  | ; OPT-NEXT:    ret i32 [[KEEPALIVE]] | 
|  | %val = load i32, ptr %input | 
|  | %ptrval = ptrtoint ptr %input to i32 | 
|  | %keepalive = add i32 %val, %ptrval | 
|  | ret i32 %keepalive | 
|  | } | 
|  |  | 
|  | ; External callee taking an i32 via byval (align 4); called by @test_forward_byval_arg. | 
|  | declare void @device_func(ptr byval(i32) align 4) | 
|  |  | 
|  | ; Kernel that forwards its "nvvm.grid_constant" byval i32 argument as the | 
|  | ; byval argument of a call to @device_func. | 
|  | ; The OPT checks expect the call operand to be the addrspace(101) wrapper | 
|  | ; cast back to a generic pointer. The PTX checks expect the value to be | 
|  | ; copied with ld.param.b32 / st.param.b32 into the outgoing param0 and a | 
|  | ; direct call.uni to device_func. | 
|  | define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 "nvvm.grid_constant" %input) { | 
|  | ; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg( | 
|  | ; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT:%.*]]) #[[ATTR0]] { | 
|  | ; OPT-NEXT:    [[INPUT_PARAM:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) | 
|  | ; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr | 
|  | ; OPT-NEXT:    call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]]) | 
|  | ; OPT-NEXT:    ret void | 
|  | ; | 
|  | ; PTX-LABEL: test_forward_byval_arg( | 
|  | ; PTX:       { | 
|  | ; PTX-NEXT:    .reg .b32 %r<2>; | 
|  | ; PTX-EMPTY: | 
|  | ; PTX-NEXT:  // %bb.0: | 
|  | ; PTX-NEXT:    { // callseq 4, 0 | 
|  | ; PTX-NEXT:    .param .align 4 .b8 param0[4]; | 
|  | ; PTX-NEXT:    ld.param.b32 %r1, [test_forward_byval_arg_param_0]; | 
|  | ; PTX-NEXT:    st.param.b32 [param0], %r1; | 
|  | ; PTX-NEXT:    call.uni device_func, (param0); | 
|  | ; PTX-NEXT:    } // callseq 4 | 
|  | ; PTX-NEXT:    ret; | 
|  | call void @device_func(ptr byval(i32) align 4 %input) | 
|  | ret void | 
|  | } | 
|  |  | 
|  |  | 
|  | ; External declarations used by the kernels in this file. | 
|  | ; NOTE(review): @escape and @escape3 are declared as returning ptr, but every | 
|  | ; call site in this file uses them as returning i32. The PTX checks for those | 
|  | ; calls go through a .callprototype with the callee address in a register; | 
|  | ; presumably that is a consequence of the mismatch. Changing these | 
|  | ; declarations would require regenerating the checks. | 
|  | declare dso_local void @dummy() local_unnamed_addr | 
|  | declare dso_local ptr @escape(ptr) local_unnamed_addr | 
|  | declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr | 