Index: llvm/lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- llvm/lib/Target/NVPTX/CMakeLists.txt
+++ llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -15,6 +15,7 @@
   NVPTXFavorNonGenericAddrSpaces.cpp
   NVPTXFrameLowering.cpp
   NVPTXGenericToNVVM.cpp
+  NVPTXHoistAddrSpaceCast.cpp
   NVPTXISelDAGToDAG.cpp
   NVPTXISelLowering.cpp
   NVPTXImageOptimizer.cpp
Index: llvm/lib/Target/NVPTX/NVPTX.h
===================================================================
--- llvm/lib/Target/NVPTX/NVPTX.h
+++ llvm/lib/Target/NVPTX/NVPTX.h
@@ -45,6 +45,7 @@
                                  llvm::CodeGenOpt::Level OptLevel);
 ModulePass *createNVPTXAssignValidGlobalNamesPass();
 ModulePass *createGenericToNVVMPass();
+FunctionPass *createNVPTXHoistAddrSpaceCastPass();
 FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
 FunctionPass *createNVPTXInferAddressSpacesPass();
 FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
Index: llvm/lib/Target/NVPTX/NVPTXHoistAddrSpaceCast.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/NVPTX/NVPTXHoistAddrSpaceCast.cpp
@@ -0,0 +1,231 @@
+//===-- NVPTXHoistAddrSpaceCast.cpp - Pass to hoist addrspacecasts --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoists "narrowing" addrspacecast instructions across known-commutative
+// instructions.
+//
+// The motivation for this pass is the CUDA __ldg builtin, which loads from a
+// pointer p via the hardware's texture cache, which is often faster than the
+// regular load path.  When the user loads from p using __ldg, they are
+// asserting two things:
+//
+//  1) The memory pointed to by p is unchanging for the lifetime of the kernel
+//     (i.e., the load is invariant), and
+//
+//  2) p is in the global address space (i.e., addrspace(1)).
+//
+// Conversely, any load that meets these two conditions can safely be lowered
+// as a load that uses the texture cache (NVPTX's ld.global.nc instruction).
+//
+// Suppose we have CUDA code that does
+//
+//   int *p = ...;
+//   int v1 = __ldg(p);
+//   int v2 = __ldg(p + 1);
+//
+// The front-end can model this as
+//
+//   i32* %p
+//   %q0 = addrspacecast i32* %p to i32 addrspace(1)*
+//   %v1 = load i32, i32 addrspace(1)* %q0, !invariant.load !0
+//
+//   %p1 = getelementptr i32, i32* %p, i32 1
+//   %q1 = addrspacecast i32* %p1 to i32 addrspace(1)*
+//   %v2 = load i32, i32 addrspace(1)* %q1, !invariant.load !0
+//
+// Because the two loads are invariant and from the global address space, LLVM
+// can lower them as texture cache loads.
+//
+// Unfortunately this form obfuscates the fact that %q0 and %q1 are adjacent
+// addresses, because LLVM treats addrspacecast as an arbitrary transformation.
+// But we want the optimizer to understand that %q0 and %q1 are adjacent so
+// that e.g. the LoadStoreVectorizer can load both %v1 and %v2 with a single
+// wide load.
+//
+// We are saved by the fact that, on NVPTX, addrspacecast is not completely a
+// black box.  In particular, addrspacecast is commutative with getelementptr.
+// We can thus reorder the instructions in the second half of the code above
+// so the cast comes first:
+//
+//   %q1' = addrspacecast i32* %p to i32 addrspace(1)*
+//   %p1' = getelementptr i32, i32 addrspace(1)* %q1', i32 1
+//   %v2  = load i32, i32 addrspace(1)* %p1'
+//
+// Now the two addrspacecasts are identical and can be combined (e.g. with
+// CSE).  After this, the LoadStoreVectorizer can see that %v1 and %v2 load
+// from adjacent addresses.
+//
+// This pass implements only half of the transformation above.  Specifically,
+// we move the cast above the GEP, but we then insert a second cast below the
+// GEP, casting back to the generic address space:
+//
+//   %q1' = addrspacecast i32* %p to i32 addrspace(1)*
+//   %p1' = getelementptr i32, i32 addrspace(1)* %q1', i32 1
+//   %r1  = addrspacecast i32 addrspace(1)* %p1' to i32*
+//   %v2  = load i32, i32* %r1
+//
+// We leave it up to the NVPTXInferAddressSpaces pass to eliminate the second
+// addrspacecast and change the address space of the load.
+//
+// This pass only hoists "narrowing" addrspacecast instructions -- that is,
+// casts from the generic address space to a non-generic one.  We want to
+// operate in the generic address space as little as possible, because generic
+// loads and stores are more expensive than their non-generic counterparts, so
+// hoisting widening casts would be counterproductive.
+//
+// You can think of hoisting an addrspacecast as propagating up the CFG an
+// assertion that a particular pointer lives in a particular address space.
+// It follows that we can only hoist an addrspacecast above an instruction if
+// the cast postdominates the instruction -- otherwise, our assertion might be
+// unfounded!  (See the example at the end of this comment.)
+//
+// TODO: Currently we only hoist through GEPs, but we could hoist through
+// bitcast and maybe other instructions as well.
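+//
+// As an example of the postdomination requirement, consider this illustrative
+// sketch (not taken from the tests in this patch):
+//
+//   %gep = getelementptr i32, i32* %p, i32 1
+//   br i1 %cond, label %bb1, label %bb2
+//
+// bb1:
+//   %asc = addrspacecast i32* %gep to i32 addrspace(1)*
+//   ...
+//
+// bb2:
+//   ...
+//
+// Here %asc does not postdominate %gep, so we must not hoist it: on the path
+// through %bb2 nothing asserts that %p points into the global address space.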
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-hoist-addrspace-cast"
+
+#include "NVPTX.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class NVPTXHoistAddrSpaceCast : public FunctionPass {
+public:
+  static char ID;
+
+  NVPTXHoistAddrSpaceCast() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  PostDominatorTree *PDT;
+};
+
+} // anonymous namespace
+
+char NVPTXHoistAddrSpaceCast::ID = 0;
+
+namespace llvm {
+void initializeNVPTXHoistAddrSpaceCastPass(PassRegistry &);
+} // namespace llvm
+
+INITIALIZE_PASS_BEGIN(NVPTXHoistAddrSpaceCast, "nvptx-hoist-addrspace-cast",
+                      "NVPTX hoist addrspacecast instrs",
+                      /* CFGOnly = */ false, /* IsAnalysis = */ false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(NVPTXHoistAddrSpaceCast, "nvptx-hoist-addrspace-cast",
+                    "NVPTX hoist addrspacecast instrs", false, false)
+
+void NVPTXHoistAddrSpaceCast::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<PostDominatorTreeWrapperPass>();
+  FunctionPass::getAnalysisUsage(AU);
+}
+
+bool NVPTXHoistAddrSpaceCast::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+
+  SmallVector<AddrSpaceCastInst *, 16> Worklist;
+
+  // Find all the addrspacecast instructions that
+  //
+  //  - are "narrowing" (i.e., convert from the generic addrspace to a
+  //    non-generic space), and
+  //
+  //  - have src and dst types that are the same except for their address
+  //    spaces (i.e., we ignore addrspacecasts that also do a pointer-type
+  //    bitcast).
+  for (Instruction &I : instructions(F))
+    if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I))
+      if (ASC->getSrcAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC &&
+          ASC->getDestAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC &&
+          ASC->getSrcTy()->getPointerElementType() ==
+              ASC->getDestTy()->getPointerElementType())
+        Worklist.push_back(ASC);
+
+  bool Changed = false;
+  SmallVector<Instruction *, 16> ToRemove;
+  while (!Worklist.empty()) {
+    AddrSpaceCastInst *ASC = Worklist.pop_back_val();
+
+    // We currently only hoist addrspacecasts above GEPs.
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(ASC->getOperand(0));
+    if (!GEP)
+      continue;
+
+    // We can hoist iff ASC post-dominates GEP.
+    if (ASC->getParent() != GEP->getParent() &&
+        !PDT->dominates(ASC->getParent(), GEP->getParent())) {
+      DEBUG(dbgs() << "[Hoist ASC] Can't hoist " << *ASC << " above " << *GEP
+                   << " because the cast does not postdominate the GEP.\n");
+      continue;
+    }
+
+    DEBUG(dbgs() << "[Hoist ASC] Hoisting " << *ASC << " above " << *GEP
+                 << "\n");
+
+    // We transform
+    //
+    //   %a = getelementptr T* %p
+    //   [...]
+    //   %b = addrspacecast T* %a to addrspace(N)
+    //
+    // to
+    //
+    //   %q  = addrspacecast T* %p to addrspace(N)
+    //   %b' = getelementptr T addrspace(N)* %q
+    //   %a' = addrspacecast T addrspace(N)* %b' to addrspace(0)
+    //   [...]
+    //
+    // Users of the original cast use the value produced by the new GEP, and
+    // users of the original GEP use the value produced by the second new cast.
+    // NVPTX address space inference will clean this up, eliminating the second
+    // cast.
+
+    Value *GEPOp = GEP->getOperand(0);
+    AddrSpaceCastInst *NewASC =
+        new AddrSpaceCastInst(GEPOp, ASC->getType(), ASC->getName(), GEP);
+    GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+        GEP->getSourceElementType(), NewASC,
+        SmallVector<Value *, 8>(GEP->idx_begin(), GEP->idx_end()),
+        GEP->getName(), GEP);
+    NewGEP->setIsInBounds(GEP->isInBounds());
+
+    AddrSpaceCastInst *NewASCToGeneric =
+        new AddrSpaceCastInst(NewGEP, GEP->getType(), ASC->getName(), GEP);
+
+    ASC->replaceAllUsesWith(NewGEP);
+    GEP->replaceAllUsesWith(NewASCToGeneric);
+    ToRemove.push_back(ASC);
+    ToRemove.push_back(GEP);
+
+    Worklist.push_back(NewASC);
+
+    Changed = true;
+  }
+
+  for (Instruction *I : ToRemove)
+    I->eraseFromParent();
+
+  return Changed;
+}
+
+FunctionPass *llvm::createNVPTXHoistAddrSpaceCastPass() {
+  return new NVPTXHoistAddrSpaceCast();
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -62,12 +62,19 @@
                          cl::desc("Disable load/store vectorizer"),
                          cl::init(false), cl::Hidden);
 
+// Switch to disable the nvptx-hoist-addrspace-cast pass in case of bugs.
+static cl::opt<bool>
+    DisableHoistAddrSpaceCast("disable-nvptx-hoist-addrspace-cast",
+                              cl::desc("Disable addrspacecast hoisting"),
+                              cl::init(false), cl::Hidden);
+
 namespace llvm {
 void initializeNVVMIntrRangePass(PassRegistry&);
 void initializeNVVMReflectPass(PassRegistry&);
 void initializeGenericToNVVMPass(PassRegistry&);
 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXHoistAddrSpaceCastPass(PassRegistry&);
 void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
 void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
@@ -88,6 +95,7 @@
   initializeGenericToNVVMPass(PR);
   initializeNVPTXAllocaHoistingPass(PR);
   initializeNVPTXAssignValidGlobalNamesPass(PR);
+  initializeNVPTXHoistAddrSpaceCastPass(PR);
   initializeNVPTXFavorNonGenericAddrSpacesPass(PR);
   initializeNVPTXInferAddressSpacesPass(PR);
   initializeNVPTXLowerArgsPass(PR);
@@ -261,6 +269,13 @@
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
 
+  if (getOptLevel() != CodeGenOpt::None && !DisableHoistAddrSpaceCast) {
+    addPass(createNVPTXHoistAddrSpaceCastPass());
+    // Address space hoisting may create duplicate casts that need to be
+    // cleaned up, so run CSE.
+    addPass(createEarlyCSEPass());
+  }
+
   // NVPTXLowerArgs is required for correctness and should be run right
   // before the address space inference passes.
   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
Index: llvm/test/CodeGen/NVPTX/hoist-addrspace-cast-e2e.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/hoist-addrspace-cast-e2e.ll
@@ -0,0 +1,52 @@
+; Check that nvptx-hoist-addrspace-cast plays well with addrspace inference
+; and the load/store vectorizer, and that we can vectorize and lower the idiom
+; clang uses for __ldg to ld.global.nc.
+
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -nvptx-hoist-addrspace-cast \
+; RUN:   -nvptx-infer-addrspace -early-cse -load-store-vectorizer -S < %s | FileCheck %s --check-prefix IR
+
+; RUN: llc -mtriple=nvptx64-nvidia-cuda -O3 < %s | FileCheck %s --check-prefix PTX
+
+; RUN: llc -mtriple=nvptx64-nvidia-cuda -O3 -disable-nvptx-hoist-addrspace-cast < %s \
+; RUN:   | FileCheck %s --check-prefix PTX-NO-HOIST
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; IR-LABEL: @easy
+define i32 @easy(i32* %p) {
+  ; IR: [[ASC:%[^ ]+]] = addrspacecast i32* %p to i32 addrspace(1)*
+  ; IR: [[GEP:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[ASC]], i32 42
+  ; IR: [[RET:%[^ ]+]] = load i32, i32 addrspace(1)* [[GEP]]
+  ; IR: ret i32 [[RET]]
+  %gep = getelementptr i32, i32* %p, i32 42
+  %cast = addrspacecast i32* %gep to i32 addrspace(1)*
+  %ret = load i32, i32* %gep
+  ret i32 %ret
+}
+
+; IR-LABEL: @ldg_idiom
+define i32 @ldg_idiom(i32* %ptr) {
+  ; IR: [[ASC:%[^ ]+]] = addrspacecast i32* %ptr to i32 addrspace(1)*
+  ; IR: [[BC:%[^ ]+]] = bitcast i32 addrspace(1)* [[ASC]] to <2 x i32> addrspace(1)*
+  ; IR: [[VLOAD:%[^ ]+]] = load <2 x i32>, <2 x i32> addrspace(1)* [[BC]], align 8, !invariant.load !0
+  ; IR: [[LOAD0:%[^ ]+]] = extractelement <2 x i32> [[VLOAD]], i32 0
+  ; IR: [[LOAD1:%[^ ]+]] = extractelement <2 x i32> [[VLOAD]], i32 1
+  ; IR: add i32 [[LOAD0]], [[LOAD1]]
+
+  ; PTX: cvta.to.global
+  ; PTX: ld.global.nc.v2.{{.}}32
+
+  ; PTX-NO-HOIST-NOT: ld.global.nc.v2.{{.}}32
+
+  %ptr1 = getelementptr i32, i32* %ptr, i32 1
+  %p0 = addrspacecast i32* %ptr to i32 addrspace(1)*
+  %p1 = addrspacecast i32* %ptr1 to i32 addrspace(1)*
+  %v0 = load i32, i32 addrspace(1)* %p0, align 8, !invariant.load !0
+  %v1 = load i32, i32 addrspace(1)* %p1, align 4, !invariant.load !0
+  %sum = add i32 %v0, %v1
+  ret i32 %sum
+}
+
+!0 = !{}
+
Index: llvm/test/CodeGen/NVPTX/hoist-addrspace-cast.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/hoist-addrspace-cast.ll
@@ -0,0 +1,59 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -nvptx-hoist-addrspace-cast -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: @easy
+define i32* @easy(i32* %p) {
+  ; CHECK: [[ASC:%[^ ]+]] = addrspacecast i32* %p to i32 addrspace(1)*
+  ; CHECK: [[GEP:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[ASC]], i32 42
+  ; CHECK: [[RET:%[^ ]+]] = addrspacecast i32 addrspace(1)* [[GEP]] to i32*
+  ; CHECK: ret i32* [[RET]]
+  %gep = getelementptr i32, i32* %p, i32 42
+  %cast = addrspacecast i32* %gep to i32 addrspace(1)*
+  ret i32* %gep
+}
+
+; CHECK-LABEL: @no_hoist_through_non_gep
+; No changes; we don't hoist through inttoptr/ptrtoint.
+define void @no_hoist_through_non_gep(i32* %p) {
+  ; CHECK: getelementptr
+  ; CHECK: ptrtoint
+  ; CHECK: inttoptr
+  ; CHECK: addrspacecast
+  %gep = getelementptr i32, i32* %p, i32 42
+  %i = ptrtoint i32* %gep to i64
+  %q = inttoptr i64 %i to i32*
+  %cast = addrspacecast i32* %q to i32 addrspace(1)*
+  ret void
+}
+
+; CHECK-LABEL: @multiple_geps
+define void @multiple_geps(i32* %p) {
+  ; CHECK: [[CAST1:%[^ ]+]] = addrspacecast i32* %p to i32 addrspace(1)*
+  ; CHECK: [[GEP1:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[CAST1]], i32 1
+  ; CHECK: addrspacecast i32 addrspace(1)* [[GEP1]] to i32*
+  ; CHECK: [[GEP2:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[GEP1]]
+  ; CHECK: addrspacecast i32 addrspace(1)* [[GEP2]] to i32*
+  %gep1 = getelementptr i32, i32* %p, i32 1
+  %gep2 = getelementptr i32, i32* %gep1, i32 2
+  %cast = addrspacecast i32* %gep2 to i32 addrspace(1)*
+  ret void
+}
+
+; CHECK-LABEL: @cast_ptr_type
+;
+; Currently we don't transform this because the addrspacecast's src and dst
+; types differ by more than just their address spaces.  That's OK, just make
+; sure we don't crash.
+define i64 addrspace(1)* @cast_ptr_type(i32* %p) {
+  ; CHECK: getelementptr
+  ; CHECK: addrspacecast
+  %gep = getelementptr i32, i32* %p, i32 42
+  %cast = addrspacecast i32* %gep to i64 addrspace(1)*
+  ret i64 addrspace(1)* %cast
+}
+
+; TODO: Write an e2e test checking that address space inference does the right
+; thing with the output of this pass.
+; TODO: Write more e2e tests checking that the LoadStoreVectorizer plus
+; addrspace inference plus this pass do the right thing.