Index: lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
+++ lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -8,35 +8,41 @@
 //===----------------------------------------------------------------------===//
 //
 // When a load/store accesses the generic address space, checks whether the
-// address is casted from a non-generic address space. If so, remove this
-// addrspacecast because accessing non-generic address spaces is typically
-// faster. Besides seeking addrspacecasts, this optimization also traces into
-// the base pointer of a GEP.
+// address is cast from a non-generic address space. If so, avoid casting
+// the address space because accessing non-generic address spaces is typically
+// faster. If the address of the load/store is a GEP, we also look into the
+// base of the GEP for redundant addrspacecasts.
 //
 // For instance, the code below loads a float from an array allocated in
 // addrspace(3).
 //
-//   %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
-//   %1 = gep [10 x float]* %0, i64 0, i64 %i
+//   %0 = addrspacecast [10 x float] addrspace(3)* @a to [5 x float]*
+//   %1 = gep [5 x float]* %0, i64 0, i64 %i
 //   %2 = load float* %1 ; emits ld.f32
 //
-// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
-// and the GEP to expose more optimization opportunities to function
-// optimizeMemoryInst. The intermediate code looks like:
+// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast and
+// the GEP to expose more optimization opportunities to function
+// optimizeMemoryInstruction. Note that if the addrspacecast changes the
+// underlying element type, we need to leave a bitcast where the
+// addrspacecast was.
 //
-//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-//   %1 = addrspacecast float addrspace(3)* %0 to float*
-//   %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+// The intermediate code from the above example looks like:
+//
+//   %0 = bitcast [10 x float] addrspace(3)* @a to [5 x float] addrspace(3)*
+//   %1 = gep [5 x float] addrspace(3)* %0, i64 0, i64 %i
+//   %2 = addrspacecast float addrspace(3)* %1 to float*
+//   %3 = load float* %2 ; still emits ld.f32, but will be optimized shortly
 //
 // Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
 // generic pointers, and folds the load and the addrspacecast into a load from
 // the original address space. The final code looks like:
 //
-//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-//   %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+//   %0 = bitcast [10 x float] addrspace(3)* @a to [5 x float] addrspace(3)*
+//   %1 = gep [5 x float] addrspace(3)* %0, i64 0, i64 %i
+//   %3 = load float addrspace(3)* %1 ; emits ld.shared.f32
 //
-// This pass may remove an addrspacecast in a different BB. Therefore, we
-// implement it as a FunctionPass.
+// The instructions that compute the address of a load/store may be in a
+// different basic block than the load/store itself. Therefore, we implement
+// this transformation as a FunctionPass.
 //
 //===----------------------------------------------------------------------===//

@@ -68,10 +74,10 @@
   /// Optimizes load/store instructions. Idx is the index of the pointer operand
   /// (0 for load, and 1 for store). Returns true if it changes anything.
   bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
-  /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
-  /// indices)". This reordering exposes to optimizeMemoryInstruction more
-  /// optimization opportunities on loads and stores. Returns true if it changes
-  /// the program.
+  /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep
+  /// (bitcast X), indices)". Reordering the gep and the addrspacecast exposes
+  /// more optimization opportunities to optimizeMemoryInstruction on loads
+  /// and stores. Returns true if it changes the program.
   bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
 };
 }

@@ -85,9 +91,16 @@
                 "Remove unnecessary non-generic-to-generic addrspacecasts",
                 false, false)

-// Decides whether removing Cast is valid and beneficial. Cast can be an
-// instruction or a constant expression.
-static bool IsEliminableAddrSpaceCast(Operator *Cast) {
+// Decides whether eliminating the addrspacecast is beneficial. If the source
+// and destination of the addrspacecast have different element types, the
+// caller will later demote the addrspacecast to a bitcast instead of
+// completely removing it. In that case, this function outputs the type of
+// this intermediate bitcast as IntermediateType.
+//
+// The parameter Cast can be either an AddrSpaceCastInst or an addrspacecast
+// ConstantExpr.
+static bool IsEliminableAddrSpaceCast(Operator *Cast,
+                                      PointerType *&IntermediateType) {
   // Returns false if not even an addrspacecast.
   if (Cast->getOpcode() != Instruction::AddrSpaceCast)
     return false;

@@ -95,12 +108,15 @@
   Value *Src = Cast->getOperand(0);
   PointerType *SrcTy = cast<PointerType>(Src->getType());
   PointerType *DestTy = cast<PointerType>(Cast->getType());
-  // TODO: For now, we only handle the case where the addrspacecast only changes
-  // the address space but not the type. If the type also changes, we could
-  // still get rid of the addrspacecast by adding an extra bitcast, but we
-  // rarely see such scenarios.
-  if (SrcTy->getElementType() != DestTy->getElementType())
-    return false;
+
+  if (SrcTy->getElementType() != DestTy->getElementType()) {
+    // addrspacecast can change not only the address space but also the
+    // underlying element type of the source pointer. If it converts the
+    // element type, we will later demote the addrspacecast to a bitcast, and
+    // we compute the type of that intermediate bitcast here as preparation.
+    IntermediateType =
+        PointerType::get(DestTy->getElementType(), SrcTy->getAddressSpace());
+  }

   // Checks whether the addrspacecast is from a non-generic address space to the
   // generic address space.
@@ -108,34 +124,39 @@
           DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
 }

-bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
-    GEPOperator *GEP) {
+bool
+NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(GEPOperator *GEP) {
   Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
   if (!Cast)
     return false;

-  if (!IsEliminableAddrSpaceCast(Cast))
+  PointerType *BitCastedType = nullptr;
+  if (!IsEliminableAddrSpaceCast(Cast, BitCastedType))
     return false;

   SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
   if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
+    Value *Src = Cast->getOperand(0);
     // %1 = gep (addrspacecast X), indices
     // =>
-    // %0 = gep X, indices
+    // %0 = gep (bitcast X), indices
     // %1 = addrspacecast %0
-    GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0),
-                                                           Indices,
-                                                           GEP->getName(),
-                                                           GEPI);
+    if (BitCastedType != nullptr) {
+      Src = new BitCastInst(Src, BitCastedType, "", GEPI);
+    }
+    GetElementPtrInst *NewGEPI =
+        GetElementPtrInst::Create(Src, Indices, GEP->getName(), GEPI);
     NewGEPI->setIsInBounds(GEP->isInBounds());
     GEP->replaceAllUsesWith(
         new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
   } else {
-    // GEP is a constant expression.
-    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
-        cast<Constant>(Cast->getOperand(0)),
-        Indices,
-        GEP->isInBounds());
+    // GEP is a ConstantExpr, so its operand Cast is also a ConstantExpr.
+    Constant *Src = cast<ConstantExpr>(Cast)->getOperand(0);
+    if (BitCastedType != nullptr) {
+      Src = ConstantExpr::getBitCast(Src, BitCastedType);
+    }
+    Constant *NewGEPCE =
+        ConstantExpr::getGetElementPtr(Src, Indices, GEP->isInBounds());
     GEP->replaceAllUsesWith(
         ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
   }

@@ -151,19 +172,25 @@
     hoistAddrSpaceCastFromGEP(GEP);
   }

-  // load/store (addrspacecast X) => load/store X if shortcutting the
-  // addrspacecast is valid and can improve performance.
+  // load/store (addrspacecast X) => load/store (bitcast X) if using the
+  // original address space is faster.
   //
   // e.g.,
-  //   %1 = addrspacecast float addrspace(3)* %0 to float*
-  //   %2 = load float* %1
+  //   %1 = addrspacecast float addrspace(3)* %0 to i32*
+  //   %2 = load i32* %1
   // ->
-  //   %2 = load float addrspace(3)* %0
+  //   %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
+  //   %2 = load i32 addrspace(3)* %1
   //
-  // Note: the addrspacecast can also be a constant expression.
+  // Note: the addrspacecast can also be a ConstantExpr.
   if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
-    if (IsEliminableAddrSpaceCast(Cast)) {
-      MI->setOperand(Idx, Cast->getOperand(0));
+    PointerType *BitCastedType = nullptr;
+    if (IsEliminableAddrSpaceCast(Cast, BitCastedType)) {
+      Value *Src = Cast->getOperand(0);
+      if (BitCastedType != nullptr) {
+        Src = new BitCastInst(Src, BitCastedType, "", MI);
+      }
+      MI->setOperand(Idx, Src);
       return true;
     }
   }
Index: test/CodeGen/NVPTX/access-non-generic.ll
===================================================================
--- test/CodeGen/NVPTX/access-non-generic.ll
+++ test/CodeGen/NVPTX/access-non-generic.ll
@@ -2,6 +2,8 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-favor-non-generic -dce | FileCheck %s --check-prefix IR

+%struct.float2 = type { float, float }
+
 @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
 @scalar = internal addrspace(3) global float 0.000000e+00, align 4

@@ -78,13 +80,36 @@
 ; different element types.
 define i32 @ld_int_from_float() {
 ; IR-LABEL: @ld_int_from_float
-; IR: addrspacecast
 ; PTX-LABEL: ld_int_from_float(
-; PTX: cvta.shared.u{{(32|64)}}
   %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
+; IR: [[ADDR:%[0-9]+]] = bitcast
+; IR-NEXT: load i32 addrspace(3)* [[ADDR]]
+; PTX: ld.shared.u32 {{%r[0-9]+}}, [scalar]
   ret i32 %1
 }

+define float @float_array_to_float2(i32 %i) {
+; IR-LABEL: @float_array_to_float2
+; PTX-LABEL: float_array_to_float2(
+; addrspacecast; gep; load => bitcast; gep; load
+  %p = addrspacecast [10 x float] addrspace(3)* @array to %struct.float2*
+  %p2 = getelementptr inbounds %struct.float2* %p, i32 0, i32 1
+  %1 = load float* %p2, align 4
+; IR: bitcast [10 x float] addrspace(3)* @array to %struct.float2 addrspace(3)*
+; IR-NEXT: getelementptr inbounds %struct.float2 addrspace(3)*
+; IR-NEXT: load float addrspace(3)*
+; PTX: ld.shared.f32 {{%f[0-9]+}}, [array+4]
+
+; load (gep (addrspacecast)) => load (gep (bitcast))
+  %2 = load float* getelementptr inbounds (%struct.float2* addrspacecast ([10 x float] addrspace(3)* @array to %struct.float2*), i32 0, i32 0), align 4
+; IR: bitcast [10 x float] addrspace(3)* @array to float addrspace(3)*
+; IR-NEXT: load float addrspace(3)*
+; PTX: ld.shared.f32 {{%f[0-9]+}}, [array]
+
+  %sum = fadd float %1, %2
+  ret float %sum
+}
+
 declare void @llvm.cuda.syncthreads() #3

 attributes #3 = { noduplicate nounwind }
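A note for context, not part of the patch: the diff never shows the driver
that feeds optimizeMemoryInstruction. In the existing pass, runOnFunction
walks every instruction and hands each load or store to
optimizeMemoryInstruction together with the index of its pointer operand
(0 for loads, 1 for stores, matching the Idx convention in the doc comment
above). A minimal sketch of that loop, assuming the LLVM 3.x-era FunctionPass
API this file is written against:

  bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
    bool Changed = false;
    for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
      for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
        if (isa<LoadInst>(I)) {
          // V = load P -- the pointer operand of a load is operand 0.
          Changed |= optimizeMemoryInstruction(I, 0);
        } else if (isa<StoreInst>(I)) {
          // store V, P -- the pointer operand of a store is operand 1.
          Changed |= optimizeMemoryInstruction(I, 1);
        }
      }
    }
    return Changed;
  }

The IR-level effect can be reproduced the same way the test's RUN line does,
e.g. "opt < test/CodeGen/NVPTX/access-non-generic.ll -S
-nvptx-favor-non-generic -dce".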