Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp =================================================================== --- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "NVPTXISelDAGToDAG.h" +#include "NVPTXUtilities.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" @@ -546,18 +547,36 @@ } static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, - unsigned codeAddrSpace, const DataLayout &DL) { - if (!Subtarget.hasLDG() || codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) { + unsigned CodeAddrSpace, MachineFunction *F) { + // To use non-coherent caching, the load has to be from global + // memory and we have to prove that the memory area is not written + // to anywhere for the duration of the kernel call, not even after + // the load. + // + // To ensure that there are no writes to the memory, we require the + // underlying pointer to be a noalias (__restrict) kernel parameter + // that is never used for a write. We can only do this for kernel + // functions since from within a device function, we cannot know if + // there were or will be writes to the memory from the caller - or we + // could, but then we would have to do inter-procedural analysis. + if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL || + !isKernelFunction(*F->getFunction())) { return false; } - // Check whether load operates on a readonly argument. - bool canUseLDG = false; - if (const Argument *A = dyn_cast<const Argument>( - GetUnderlyingObject(N->getMemOperand()->getValue(), DL))) - canUseLDG = A->onlyReadsMemory() && A->hasNoAliasAttr(); + // We use GetUnderlyingObjects() here instead of + // GetUnderlyingObject() mainly because the former looks through phi + // nodes while the latter does not. We need to look through phi + // nodes to handle pointer induction variables. 
+ SmallVector<Value *, 8> Objs; + GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()), + Objs, F->getDataLayout()); + for (Value *Obj : Objs) { + auto *A = dyn_cast<const Argument>(Obj); + if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false; + } - return canUseLDG; + return true; } SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) { @@ -654,7 +673,7 @@ // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(LD); - if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, CurDAG->getDataLayout())) { + if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) { return SelectLDGLDU(N); } @@ -892,7 +911,7 @@ // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD); - if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, CurDAG->getDataLayout())) { + if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { return SelectLDGLDU(N); } Index: test/CodeGen/NVPTX/load-with-non-coherent-cache.ll =================================================================== --- test/CodeGen/NVPTX/load-with-non-coherent-cache.ll +++ test/CodeGen/NVPTX/load-with-non-coherent-cache.ll @@ -189,7 +189,60 @@ ret void } -!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18} +; Test that we can infer a cached load for a pointer induction variable. 
+; SM20-LABEL: .visible .entry foo19( +; SM20: ld.global.f32 +; SM35-LABEL: .visible .entry foo19( +; SM35: ld.global.nc.f32 +define void @foo19(float * noalias readonly %from, float * %to, i32 %n) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %nexti, %loop ] + %sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ] + %ptr = getelementptr inbounds float, float * %from, i32 %i + %value = load float, float * %ptr, align 4 + %nextsum = fadd float %value, %sum + %nexti = add nsw i32 %i, 1 + %exitcond = icmp eq i32 %nexti, %n + br i1 %exitcond, label %exit, label %loop + +exit: + store float %nextsum, float * %to + ret void +} + +; This test captures the case of a non-kernel function. In a +; non-kernel function, without interprocedural analysis, we do not +; know that the parameter is global. We also do not know that the +; pointed-to memory is never written to (for the duration of the +; kernel). For both reasons, we cannot use a cached load here. +; SM20-LABEL: notkernel( +; SM20: ld.f32 +; SM35-LABEL: notkernel( +; SM35: ld.f32 +define void @notkernel(float * noalias readonly %from, float * %to) { + %1 = load float, float * %from + store float %1, float * %to + ret void +} + +; As @notkernel, but with the parameter explicitly marked as global. We still +; do not know that the parameter is never written to (for the duration of the +; kernel). This case does not currently come up normally since we do not infer +; that pointers are global interprocedurally as of 2015-08-05. 
+; SM20-LABEL: notkernel2( +; SM20: ld.global.f32 +; SM35-LABEL: notkernel2( +; SM35: ld.global.f32 +define void @notkernel2(float addrspace(1) * noalias readonly %from, float * %to) { + %1 = load float, float addrspace(1) * %from + store float %1, float * %to + ret void +} + +!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18, !19} !1 = !{void (float *, float *)* @foo1, !"kernel", i32 1} !2 = !{void (double *, double *)* @foo2, !"kernel", i32 1} !3 = !{void (i16 *, i16 *)* @foo3, !"kernel", i32 1} @@ -208,3 +261,4 @@ !16 = !{void (<4 x float> *, <4 x float> *)* @foo16, !"kernel", i32 1} !17 = !{void (<4 x double> *, <4 x double> *)* @foo17, !"kernel", i32 1} !18 = !{void (float **, float **)* @foo18, !"kernel", i32 1} +!19 = !{void (float *, float *, i32)* @foo19, !"kernel", i32 1}