diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -140,6 +140,7 @@
 
 // =============================================================================
 // If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
+// and we can't guarantee that the only accesses are loads,
 // then add the following instructions to the first basic block:
 //
 // %temp = alloca %struct.x, align 8
@@ -150,7 +151,47 @@
 // The above code allocates some space in the stack and copies the incoming
 // struct from param space to local space.
 // Then replace all occurrences of %d by %temp.
+//
+// In case we know that all users are GEPs or Loads, replace them with the same
+// ones in parameter AS, so we can access them using ld.param.
 // =============================================================================
+
+// Replaces the \p OldUser instruction with the same one in parameter AS.
+// Only Load and GEP are supported.
+static void ConvertToParamAS(Value *OldUser, Value *Param) {
+  Instruction *I = dyn_cast<Instruction>(OldUser);
+  assert(I && "OldUser must be an instruction");
+  if (auto *LI = dyn_cast<LoadInst>(I)) {
+    auto *NewLI = new LoadInst(LI->getType(), Param, LI->getName(),
+                               LI->isVolatile(), LI->getAlign(), I);
+    LI->replaceAllUsesWith(NewLI);
+    LI->eraseFromParent();
+    return;
+  }
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+    SmallVector<Value *, 4> Indices(GEP->indices());
+    auto *NewGEP =
+        GetElementPtrInst::Create(nullptr, Param, Indices, GEP->getName(), GEP);
+    NewGEP->setIsInBounds(GEP->isInBounds());
+
+    SmallVector<Value *, 16> UsersToUpdate(GEP->users());
+    llvm::for_each(UsersToUpdate,
+                   [NewGEP](Value *V) { ConvertToParamAS(V, NewGEP); });
+    GEP->eraseFromParent();
+    return;
+  }
+  llvm_unreachable("Only Load and GEP can be converted to param AS.");
+}
+
+static bool isALoadChain(Value *V) {
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    if (isa<GetElementPtrInst>(I))
+      return llvm::all_of(I->users(), isALoadChain);
+    return isa<LoadInst>(I);
+  }
+  return false;
+}
+
 void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   Function *Func = Arg->getParent();
   Instruction *FirstInst = &(Func->getEntryBlock().front());
@@ -159,6 +200,21 @@
   assert(PType && "Expecting pointer type in handleByValParam");
 
   Type *StructType = PType->getElementType();
+
+  if (llvm::all_of(Arg->users(), isALoadChain)) {
+    // Replace all loads with equivalent loads in param AS. This allows loading
+    // the Arg directly from parameter AS, without making a temporary copy.
+    SmallVector<Value *, 16> UsersToUpdate(Arg->users());
+    Value *ArgInParamAS = new AddrSpaceCastInst(
+        Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+        FirstInst);
+    llvm::for_each(UsersToUpdate, [ArgInParamAS](Value *V) {
+      ConvertToParamAS(V, ArgInParamAS);
+    });
+    return;
+  }
+
+  // Otherwise we have to create a temporary copy.
   const DataLayout &DL = Func->getParent()->getDataLayout();
   unsigned AS = DL.getAllocaAddrSpace();
   AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
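NOTE: the new fast path only fires when every use of the byval argument is a chain of GEPs ending in loads (isALoadChain). A rough sketch of the intended IR rewrite, reusing the %struct.x/%d names from the comment above; this is an illustration only (the %d.param/%p.param names are made up, the actual names come from the original instructions), not output produced verbatim by the patch:

    ; before NVPTXLowerArgs, inside a kernel foo(%struct.x* byval %d)
    %p = getelementptr inbounds %struct.x, %struct.x* %d, i64 0, i32 0
    %v = load i32, i32* %p, align 4

    ; after NVPTXLowerArgs, when all users of %d are GEPs/loads
    %d.param = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
    %p.param = getelementptr inbounds %struct.x, %struct.x addrspace(101)* %d.param, i64 0, i32 0
    %v = load i32, i32 addrspace(101)* %p.param, align 4

Loads from addrspace(101) are then emitted as ld.param, which is what the new test below checks for.
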
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ham = type { [4 x i32] }
+
+; // Verify that load with static offset into parameter is done directly.
+; CHECK-LABEL: .visible .entry static_offset
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @static_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp eq i32 %arg2, 3
+  br i1 %tmp, label %bb3, label %bb6
+
+bb3:                                              ; preds = %bb
+  %tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 3
+  %tmp5 = load i32, i32* %tmp4, align 4
+  store i32 %tmp5, i32* %arg, align 4
+  br label %bb6
+
+bb6:                                              ; preds = %bb3, %bb
+  ret void
+}
+
+; // Verify that load with dynamic offset into parameter is also done directly.
+; CHECK-LABEL: .visible .entry dynamic_offset
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @dynamic_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  ret void
+}
+
+; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
+; CHECK-LABEL: .visible .entry pointer_escapes
+; CHECK: .local .align 8 .b8 __local_depot{{.*}}
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
+; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+
+; Function Attrs: convergent norecurse nounwind mustprogress
+define dso_local void @pointer_escapes(i32* nocapture %arg, %struct.ham* byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  %tmp5 = call i32* @escape(i32* nonnull %tmp3) #3
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare dso_local i32* @escape(i32*) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i32*, %struct.ham*, i32)* @static_offset, !"kernel", i32 1}
+!4 = !{void (i32*, %struct.ham*, i32)* @dynamic_offset, !"kernel", i32 1}
+!5 = !{void (i32*, %struct.ham*, i32)* @pointer_escapes, !"kernel", i32 1}
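
NOTE: in @pointer_escapes the GEP result is passed to @escape, so isALoadChain rejects the argument and handleByValParam falls back to the pre-existing copy path described in the pass comment. Roughly, the entry block then gets (value names made up for illustration; the real ones come from Arg->getName()):

    %arg1.copy = alloca %struct.ham, align 4
    %arg1.param = addrspacecast %struct.ham* %arg1 to %struct.ham addrspace(101)*
    %tv = load %struct.ham, %struct.ham addrspace(101)* %arg1.param, align 4
    store %struct.ham %tv, %struct.ham* %arg1.copy, align 4

with the remaining uses of %arg1 rewritten to the alloca; the __local_depot plus the st.local/ld.local checks in the test correspond to that temporary copy.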