diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -140,6 +140,7 @@
 // =============================================================================
 // If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
+// and we can't guarantee that the only accesses are loads,
 // then add the following instructions to the first basic block:
 //
 // %temp = alloca %struct.x, align 8
@@ -150,7 +151,57 @@
 // The above code allocates some space in the stack and copies the incoming
 // struct from param space to local space.
 // Then replace all occurrences of %d by %temp.
+//
+// In case we know that all users are GEPs or Loads, replace them with the same
+// ones in parameter AS, so we can access them using ld.param.
 // =============================================================================
+
+// Replaces the \p OldUser instruction with the same in parameter AS.
+// Only Load and GEP are supported.
+static void convertToParamAS(Value *OldUser, Value *Param) {
+  Instruction *I = dyn_cast<Instruction>(OldUser);
+  assert(I && "OldUser must be an instruction");
+  struct IP {
+    Instruction *OldInstruction;
+    Value *NewParam;
+  };
+  SmallVector<IP> ItemsToConvert = {{I, Param}};
+  SmallVector<GetElementPtrInst *> GEPsToDelete;
+  while (!ItemsToConvert.empty()) {
+    IP I = ItemsToConvert.pop_back_val();
+    if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction))
+      LI->setOperand(0, I.NewParam);
+    else if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {
+      SmallVector<Value *> Indices(GEP->indices());
+      auto *NewGEP = GetElementPtrInst::Create(nullptr, I.NewParam, Indices,
+                                               GEP->getName(), GEP);
+      NewGEP->setIsInBounds(GEP->isInBounds());
+      llvm::for_each(GEP->users(), [NewGEP, &ItemsToConvert](Value *V) {
+        ItemsToConvert.push_back({cast<Instruction>(V), NewGEP});
+      });
+      GEPsToDelete.push_back(GEP);
+    } else
+      llvm_unreachable("Only Load and GEP can be converted to param AS.");
+  }
+  llvm::for_each(GEPsToDelete,
+                 [](GetElementPtrInst *GEP) { GEP->eraseFromParent(); });
+}
+
+static bool isALoadChain(Value *Start) {
+  SmallVector<Value *> ValuesToCheck = {Start};
+  while (!ValuesToCheck.empty()) {
+    Value *V = ValuesToCheck.pop_back_val();
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (!I)
+      return false;
+    if (isa<GetElementPtrInst>(I))
+      ValuesToCheck.append(I->user_begin(), I->user_end());
+    else if (!isa<LoadInst>(I))
+      return false;
+  }
+  return true;
+}
+
 void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   Function *Func = Arg->getParent();
   Instruction *FirstInst = &(Func->getEntryBlock().front());
@@ -159,6 +210,21 @@
   assert(PType && "Expecting pointer type in handleByValParam");
 
   Type *StructType = PType->getElementType();
+
+  if (llvm::all_of(Arg->users(), isALoadChain)) {
+    // Replace all loads with the loads in param AS. This allows loading the
+    // Arg directly from parameter AS, without making a temporary copy.
+    SmallVector<Value *> UsersToUpdate(Arg->users());
+    Value *ArgInParamAS = new AddrSpaceCastInst(
+        Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+        FirstInst);
+    llvm::for_each(UsersToUpdate, [ArgInParamAS](Value *V) {
+      convertToParamAS(V, ArgInParamAS);
+    });
+    return;
+  }
+
+  // Otherwise we have to create a temporary copy.
   const DataLayout &DL = Func->getParent()->getDataLayout();
   unsigned AS = DL.getAllocaAddrSpace();
   AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -23,5 +23,12 @@
   %arg.idx.val.val = load i32, i32* %arg.idx.val, align 4
   %add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
   store i32 %add.i, i32* %arg.idx1.val, align 4
+
+  ; Let the pointer escape so we still create a local copy, which this test
+  ; uses to check the load alignment.
+  %tmp = call i32* @escape(i32* nonnull %arg.idx2)
   ret void
 }
+
+; Function Attrs: convergent nounwind
+declare dso_local i32* @escape(i32*) local_unnamed_addr
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ham = type { [4 x i32] }
+
+; Verify that a load with a static offset into the parameter is done directly.
+; CHECK-LABEL: .visible .entry static_offset
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @static_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp eq i32 %arg2, 3
+  br i1 %tmp, label %bb3, label %bb6
+
+bb3:                                              ; preds = %bb
+  %tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 3
+  %tmp5 = load i32, i32* %tmp4, align 4
+  store i32 %tmp5, i32* %arg, align 4
+  br label %bb6
+
+bb6:                                              ; preds = %bb3, %bb
+  ret void
+}
+
+; Verify that a load with a dynamic offset into the parameter is also done directly.
+; CHECK-LABEL: .visible .entry dynamic_offset
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @dynamic_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  ret void
+}
+
+; Verify that if the pointer escapes, we do fall back to using a temporary copy.
+; CHECK-LABEL: .visible .entry pointer_escapes
+; CHECK: .local .align 8 .b8 __local_depot{{.*}}
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
+; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+
+; Function Attrs: convergent norecurse nounwind mustprogress
+define dso_local void @pointer_escapes(i32* nocapture %arg, %struct.ham* byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  %tmp5 = call i32* @escape(i32* nonnull %tmp3) #3
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare dso_local i32* @escape(i32*) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i32*, %struct.ham*, i32)* @static_offset, !"kernel", i32 1}
+!4 = !{void (i32*, %struct.ham*, i32)* @dynamic_offset, !"kernel", i32 1}
+!5 = !{void (i32*, %struct.ham*, i32)* @pointer_escapes, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
--- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -35,7 +35,7 @@
 ; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
 ; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
-  %b = load i32*, i32** %b_ptr, align 4
+  %b = load i32*, i32** %b_ptr, align 8
   %v = load i32, i32* %b, align 4
 ; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
   store i32 %v, i32* %output, align 4
@@ -51,7 +51,7 @@
 ; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
 ; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
-  %b = load i32*, i32** %b_ptr, align 4
+  %b = load i32*, i32** %b_ptr, align 8
   %v = load i32, i32* %b, align 4
 ; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
   store i32 %v, i32* %output, align 4
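
For context, here is a rough before/after sketch of the fast path this patch adds. The kernel name and value names below are made up for illustration; they are not part of the patch or its tests, and the !nvvm.annotations kernel metadata is omitted. Given a kernel whose byval argument is only accessed through GEPs and loads, e.g.

  %struct.ham = type { [4 x i32] }

  define void @example(%struct.ham* byval(%struct.ham) align 4 %in, i32* %out) {
    %p = getelementptr inbounds %struct.ham, %struct.ham* %in, i64 0, i32 0, i64 3
    %v = load i32, i32* %p, align 4
    store i32 %v, i32* %out, align 4
    ret void
  }

NVPTXLowerArgs no longer emits the alloca/copy sequence; it casts the argument into the param address space (101) and rewrites the GEP/load chain in place, roughly:

  %in.param = addrspacecast %struct.ham* %in to %struct.ham addrspace(101)*
  %p1 = getelementptr inbounds %struct.ham, %struct.ham addrspace(101)* %in.param, i64 0, i32 0, i64 3
  %v = load i32, i32 addrspace(101)* %p1, align 4

which the backend can then select as a direct ld.param.u32, rather than a local copy followed by ld.local.u32 as in the pointer_escapes case above.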