Index: lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
+++ lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -10,34 +10,54 @@
 // When a load/store accesses the generic address space, checks whether the
 // address is casted from a non-generic address space. If so, remove this
 // addrspacecast because accessing non-generic address spaces is typically
-// faster. Besides seeking addrspacecasts, this optimization also traces into
-// the base pointer of a GEP.
+// faster. Besides removing addrspacecasts directly used by loads/stores, this
+// optimization also recursively traces into a GEP's pointer operand and a
+// bitcast's source to find more eliminable addrspacecasts.
 //
 // For instance, the code below loads a float from an array allocated in
 // addrspace(3).
 //
-// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
-// %1 = gep [10 x float]* %0, i64 0, i64 %i
-// %2 = load float* %1 ; emits ld.f32
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float]* %0, i64 0, i64 %i
+// %2 = bitcast float* %1 to i32*
+// %3 = load i32* %2 ; emits ld.u32
 //
-// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
-// and the GEP to expose more optimization opportunities to function
+// First, function hoistAddrSpaceCastFrom reorders the addrspacecast, the GEP,
+// and the bitcast to expose more optimization opportunities to function
 // optimizeMemoryInst. The intermediate code looks like:
 //
-// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-// %1 = addrspacecast float addrspace(3)* %0 to float*
-// %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
+// %2 = addrspacecast i32 addrspace(3)* %1 to i32*
+// %3 = load i32* %2 ; still emits ld.u32, but will be optimized shortly
 //
 // Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
 // generic pointers, and folds the load and the addrspacecast into a load from
 // the original address space. The final code looks like:
 //
-// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-// %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
+// %3 = load i32 addrspace(3)* %1 ; emits ld.shared.u32
 //
 // This pass may remove an addrspacecast in a different BB. Therefore, we
 // implement it as a FunctionPass.
 //
+// TODO:
+// The current implementation doesn't handle PHINodes. Eliminating
+// addrspacecasts used by PHINodes is trickier because PHINodes can introduce
+// loops in data flow. For example,
+//
+//   %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+//   %y = phi [ %generic.input, %y2 ]
+//   %y2 = getelementptr %y, 1
+//   %v = load %y2
+//   br ..., label %loop, ...
+//
+// Marking %y2 shared depends on marking %y shared, but %y also data-flow
+// depends on %y2. We probably need an iterative fixed-point algorithm to handle
+// this case.
+//
 //===----------------------------------------------------------------------===//
 
 #include "NVPTX.h"
@@ -62,17 +82,31 @@
 public:
   static char ID;
   NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}
-
   bool runOnFunction(Function &F) override;
 
+private:
   /// Optimizes load/store instructions. Idx is the index of the pointer operand
   /// (0 for load, and 1 for store). Returns true if it changes anything.
   bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
 
+  /// Recursively traces into a GEP's pointer operand or a bitcast's source to
+  /// find an eliminable addrspacecast, and hoists that addrspacecast to the
+  /// outermost level. For example, this function transforms
+  ///   bitcast(gep(gep(addrspacecast(X))))
+  /// to
+  ///   addrspacecast(bitcast(gep(gep(X)))).
+  ///
+  /// This reordering exposes to optimizeMemoryInstruction more
+  /// optimization opportunities on loads and stores.
+  ///
+  /// Returns true if this function successfully hoists an eliminable
+  /// addrspacecast or V is already such an addrspacecast.
   /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
-  /// indices)". This reordering exposes to optimizeMemoryInstruction more
-  /// optimization opportunities on loads and stores. Returns true if it changes
-  /// the program.
-  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
+  /// indices)".
+  bool hoistAddrSpaceCastFrom(Value *V, int Depth = 0);
+  /// Helper function for GEPs.
+  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP, int Depth);
+  /// Helper function for bitcasts.
+  bool hoistAddrSpaceCastFromBitCast(BitCastOperator *BC, int Depth);
 };
 }
 
@@ -85,11 +119,12 @@
                 "Remove unnecessary non-generic-to-generic addrspacecasts",
                 false, false)
 
-// Decides whether removing Cast is valid and beneficial. Cast can be an
-// instruction or a constant expression.
-static bool IsEliminableAddrSpaceCast(Operator *Cast) {
-  // Returns false if not even an addrspacecast.
-  if (Cast->getOpcode() != Instruction::AddrSpaceCast)
+// Decides whether V is an addrspacecast and shortcutting V in load/store is
+// valid and beneficial.
+static bool isEliminableAddrSpaceCast(Value *V) {
+  // Returns false if V is not even an addrspacecast.
+  Operator *Cast = dyn_cast<Operator>(V);
+  if (Cast == nullptr || Cast->getOpcode() != Instruction::AddrSpaceCast)
     return false;
 
   Value *Src = Cast->getOperand(0);
@@ -108,67 +143,119 @@
           DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
 }
 
-bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
-    GEPOperator *GEP) {
-  Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
-  if (!Cast)
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(GEPOperator *GEP,
+                                                               int Depth) {
+  if (!hoistAddrSpaceCastFrom(GEP->getPointerOperand(), Depth + 1))
     return false;
 
-  if (!IsEliminableAddrSpaceCast(Cast))
-    return false;
+  // Since hoistAddrSpaceCastFrom succeeded, GEP's pointer operand is now an
+  // eliminable addrspacecast.
+  assert(isEliminableAddrSpaceCast(GEP->getPointerOperand()));
+  Operator *Cast = cast<Operator>(GEP->getPointerOperand());
 
   SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
   if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
-    // %1 = gep (addrspacecast X), indices
+    // GEP = gep (addrspacecast X), indices
     // =>
-    // %0 = gep X, indices
-    // %1 = addrspacecast %0
-    GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(
+    // NewGEP = gep X, indices
+    // NewASC = addrspacecast NewGEP
+    GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
         GEP->getSourceElementType(), Cast->getOperand(0), Indices,
-        GEP->getName(), GEPI);
-    NewGEPI->setIsInBounds(GEP->isInBounds());
-    GEP->replaceAllUsesWith(
-        new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
+        "", GEPI);
+    NewGEP->setIsInBounds(GEP->isInBounds());
+    Value *NewASC = new AddrSpaceCastInst(NewGEP, GEP->getType(), "", GEPI);
+    NewASC->takeName(GEP);
+    GEP->replaceAllUsesWith(NewASC);
   } else {
     // GEP is a constant expression.
-    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
+    Constant *NewGEP = ConstantExpr::getGetElementPtr(
         GEP->getSourceElementType(), cast<Constant>(Cast->getOperand(0)),
         Indices, GEP->isInBounds());
     GEP->replaceAllUsesWith(
-        ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
+        ConstantExpr::getAddrSpaceCast(NewGEP, GEP->getType()));
   }
 
   return true;
 }
 
-bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
-                                                               unsigned Idx) {
-  // If the pointer operand is a GEP, hoist the addrspacecast if any from the
-  // GEP to expose more optimization opportunites.
-  if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) {
-    hoistAddrSpaceCastFromGEP(GEP);
-  }
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromBitCast(
+    BitCastOperator *BC, int Depth) {
+  if (!hoistAddrSpaceCastFrom(BC->getOperand(0), Depth + 1))
+    return false;
 
-  // load/store (addrspacecast X) => load/store X if shortcutting the
-  // addrspacecast is valid and can improve performance.
-  //
-  // e.g.,
-  // %1 = addrspacecast float addrspace(3)* %0 to float*
-  // %2 = load float* %1
-  // ->
-  // %2 = load float addrspace(3)* %0
-  //
-  // Note: the addrspacecast can also be a constant expression.
-  if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
-    if (IsEliminableAddrSpaceCast(Cast)) {
-      MI->setOperand(Idx, Cast->getOperand(0));
-      return true;
-    }
+  // Since hoistAddrSpaceCastFrom succeeded, BC's source operand is now an
+  // eliminable addrspacecast.
+  assert(isEliminableAddrSpaceCast(BC->getOperand(0)));
+  Operator *Cast = cast<Operator>(BC->getOperand(0));
+
+  // Cast  = addrspacecast Src
+  // BC    = bitcast Cast
+  //   =>
+  // Cast' = bitcast Src
+  // BC'   = addrspacecast Cast'
+  Value *Src = Cast->getOperand(0);
+  Type *TypeOfNewCast =
+      PointerType::get(BC->getType()->getPointerElementType(),
+                       Src->getType()->getPointerAddressSpace());
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(BC)) {
+    Value *NewCast = new BitCastInst(Src, TypeOfNewCast, "", BCI);
+    Value *NewBC = new AddrSpaceCastInst(NewCast, BC->getType(), "", BCI);
+    NewBC->takeName(BC);
+    BC->replaceAllUsesWith(NewBC);
+  } else {
+    // BC is a constant expression.
+    Constant *NewCast =
+        ConstantExpr::getBitCast(cast<Constant>(Src), TypeOfNewCast);
+    Constant *NewBC = ConstantExpr::getAddrSpaceCast(NewCast, BC->getType());
+    BC->replaceAllUsesWith(NewBC);
   }
+  return true;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFrom(Value *V,
+                                                            int Depth) {
+  // Returns true if V is already an eliminable addrspacecast.
+  if (isEliminableAddrSpaceCast(V))
+    return true;
+
+  // Limit the depth to prevent this recursive function from running too long.
+  static const int MaxDepth = 20;
+  if (Depth >= MaxDepth)
+    return false;
+
+  // If V is a GEP or bitcast, hoist the addrspacecast, if any, from its pointer
+  // operand. This enables optimizeMemoryInstruction to shortcut addrspacecasts
+  // that are not directly used by the load/store.
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
+    return hoistAddrSpaceCastFromGEP(GEP, Depth);
+
+  if (BitCastOperator *BC = dyn_cast<BitCastOperator>(V))
+    return hoistAddrSpaceCastFromBitCast(BC, Depth);
 
   return false;
 }
 
+bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
+                                                               unsigned Idx) {
+  if (hoistAddrSpaceCastFrom(MI->getOperand(Idx))) {
+    // load/store (addrspacecast X) => load/store X if shortcutting the
+    // addrspacecast is valid and can improve performance.
+    //
+    // e.g.,
+    // %1 = addrspacecast float addrspace(3)* %0 to float*
+    // %2 = load float* %1
+    // ->
+    // %2 = load float addrspace(3)* %0
+    //
+    // Note: the addrspacecast can also be a constant expression.
+    assert(isEliminableAddrSpaceCast(MI->getOperand(Idx)));
+    Operator *ASC = cast<Operator>(MI->getOperand(Idx));
+    MI->setOperand(Idx, ASC->getOperand(0));
+    return true;
+  }
+  return false;
+}
+
 bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
   if (DisableFavorNonGeneric)
     return false;
Index: test/CodeGen/NVPTX/access-non-generic.ll
===================================================================
--- test/CodeGen/NVPTX/access-non-generic.ll
+++ test/CodeGen/NVPTX/access-non-generic.ll
@@ -85,6 +85,22 @@
   ret i32 %1
 }
 
+define i32 @ld_int_from_global_float(float addrspace(1)* %input, i32 %i, i32 %j) {
+; IR-LABEL: @ld_int_from_global_float(
+; PTX-LABEL: ld_int_from_global_float(
+  %1 = addrspacecast float addrspace(1)* %input to float*
+  %2 = getelementptr float, float* %1, i32 %i
+; IR-NEXT: getelementptr float, float addrspace(1)* %input, i32 %i
+  %3 = getelementptr float, float* %2, i32 %j
+; IR-NEXT: getelementptr float, float addrspace(1)* {{%[^,]+}}, i32 %j
+  %4 = bitcast float* %3 to i32*
+; IR-NEXT: bitcast float addrspace(1)* {{%[^ ]+}} to i32 addrspace(1)*
+  %5 = load i32, i32* %4
+; IR-NEXT: load i32, i32 addrspace(1)* {{%.+}}
+; PTX: ld.global
+  ret i32 %5
+}
+
 declare void @llvm.cuda.syncthreads() #3
 
 attributes #3 = { noduplicate nounwind }
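
For reference, the IR-NEXT checks in the new test describe a transformed @ld_int_from_global_float roughly like the sketch below (value names are illustrative; the actual numbering depends on the pass and on later cleanup of the now-dead addrspacecast):

define i32 @ld_int_from_global_float(float addrspace(1)* %input, i32 %i, i32 %j) {
  %1 = getelementptr float, float addrspace(1)* %input, i32 %i
  %2 = getelementptr float, float addrspace(1)* %1, i32 %j
  %3 = bitcast float addrspace(1)* %2 to i32 addrspace(1)*
  %4 = load i32, i32 addrspace(1)* %3
  ret i32 %4
}

Since every access now addresses addrspace(1) directly, the backend can select an ld.global form for the final load, which is what the PTX check expects.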