diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -67,6 +67,7 @@
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
+FunctionPass *createAMDGPULateCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
@@ -225,6 +226,9 @@
 void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
 extern char &AMDGPUCodeGenPrepareID;

+void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
+extern char &AMDGPULateCodeGenPrepareID;
+
 void initializeSIAnnotateControlFlowPass(PassRegistry&);
 extern char &SIAnnotateControlFlowPassID;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -0,0 +1,193 @@
+//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass performs miscellaneous AMDGPU-specific optimizations on IR *just*
+/// before instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include
+#include
+
+#define DEBUG_TYPE "amdgpu-late-codegenprepare"
+
+using namespace llvm;
+
+static cl::opt<bool>
+    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
+               cl::desc("Widen sub-dword constant address space loads in "
+                        "AMDGPULateCodeGenPrepare"),
+               cl::ReallyHidden, cl::init(true));
+
+namespace {
+
+class AMDGPULateCodeGenPrepare
+    : public FunctionPass,
+      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
+  Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
+
+  AssumptionCache *AC = nullptr;
+  LegacyDivergenceAnalysis *DA = nullptr;
+
+public:
+  static char ID;
+
+  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "AMDGPU IR late optimizations";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
+    AU.setPreservesAll();
+  }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  bool visitInstruction(Instruction &) { return false; }
+
+  // Check whether the specified value is at least DWORD aligned.
+  bool isDWORDAligned(const Value *V) const {
+    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
+    return Known.countMinTrailingZeros() >= 2;
+  }
+
+  bool canWidenScalarExtLoad(LoadInst &LI) const;
+  bool visitLoadInst(LoadInst &LI);
+};
+
+} // end anonymous namespace
+
+bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
+  Mod = &M;
+  DL = &Mod->getDataLayout();
+  return false;
+}
+
+bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+  bool Changed = false;
+  for (auto &BB : F)
+    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
+      Instruction *I = &*BI++;
+      Changed |= visit(*I);
+    }
+
+  return Changed;
+}
+
+bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
+  unsigned AS = LI.getPointerAddressSpace();
+  // Skip loads from non-constant address spaces.
+  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
+      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+    return false;
+  // Skip non-simple loads.
+  if (!LI.isSimple())
+    return false;
+  auto Ty = LI.getType();
+  // Skip aggregate types.
+  if (Ty->isAggregateType())
+    return false;
+  unsigned TySize = DL->getTypeStoreSize(Ty);
+  // Only handle sub-DWORD loads.
+  if (TySize >= 4)
+    return false;
+  // The load must be at least naturally aligned.
+  if (LI.getAlign() < DL->getABITypeAlign(Ty))
+    return false;
+  // It should be uniform, i.e. a scalar load.
+  return DA->isUniform(&LI);
+}
+
+bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
+  if (!WidenLoads)
+    return false;
+
+  // Skip loads that are already at least DWORD aligned, as they are handled
+  // in SDAG.
+  if (LI.getAlign() >= 4)
+    return false;
+
+  if (!canWidenScalarExtLoad(LI))
+    return false;
+
+  int64_t Offset = 0;
+  auto Base =
+      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
+  // If the base is not DWORD aligned, it is not safe to perform the following
+  // transformation.
+  if (!isDWORDAligned(Base))
+    return false;
+
+  int64_t Adjust = Offset & 0x3;
+  if (Adjust == 0) {
+    // With a zero adjustment, the original alignment can simply be promoted
+    // to DWORD alignment.
+    LI.setAlignment(Align(4));
+    return true;
+  }
+
+  IRBuilder<> IRB(&LI);
+  IRB.SetCurrentDebugLocation(LI.getDebugLoc());
+
+  unsigned AS = LI.getPointerAddressSpace();
+  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
+  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
+
+  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
+  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
+  auto NewPtr = IRB.CreateBitCast(
+      IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
+                             Offset - Adjust),
+      Int32PtrTy);
+  LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
+  NewLd->copyMetadata(LI);
+  NewLd->setMetadata(LLVMContext::MD_range, nullptr);
+
+  unsigned ShAmt = Adjust * 8;
+  auto NewVal = IRB.CreateBitCast(
+      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
+  LI.replaceAllUsesWith(NewVal);
+  RecursivelyDeleteTriviallyDeadInstructions(&LI);
+
+  return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+                      "AMDGPU IR late optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+                    "AMDGPU IR late optimizations", false, false)
+
+char AMDGPULateCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
+  return new AMDGPULateCodeGenPrepare();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -237,6 +237,7 @@
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
+  initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -790,6 +791,9 @@
 }

 bool AMDGPUPassConfig::addPreISel() {
+  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
+    addPass(createAMDGPULateCodeGenPreparePass());
   addPass(createLowerSwitchPass());
   addPass(createFlattenCFGPass());
   return false;
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -51,6 +51,7 @@
   AMDGPUISelDAGToDAG.cpp
   AMDGPUISelLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPULateCodeGenPrepare.cpp
   AMDGPULegalizerInfo.cpp
   AMDGPULibCalls.cpp
   AMDGPULibFunc.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -14,6 +14,48 @@
   ret void
 }

+; GCN-LABEL: {{^}}test2
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_load_dword s[[REG:[0-9]+]], s[4:5], 0x1
+; GCN: s_lshr_b32 s{{[0-9]+}}, s[[REG]], 16
+; GCN-NOT: load_ushort
+; GCN: s_endpgm
+define amdgpu_kernel void @test2(i32 addrspace(1)* %out) {
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %d1 = getelementptr inbounds i8, i8 addrspace(4)* %dispatch_ptr, i64 6
+  %h1 = bitcast i8 addrspace(4)* %d1 to i16 addrspace(4)*
+  %v1 = load i16, i16 addrspace(4)* %h1
+  %e1 = zext i16 %v1 to i32
+  store i32 %e1, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test3
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_load_dword s[[REG:[0-9]+]], s[4:5], 0x1
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[REG]], 16
+; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[REG]], 0xffff
+; GCN-NOT: s_load_dword s{{[0-9]+}}, s[4:5]
+; GCN-NOT: load_ushort
+; GCN: s_endpgm
+; In this test, the two sub-dword loads are widened into effectively the same
+; dword load: one in SDAG and the other in the late IR optimization. SDAG
+; should then CSE them into a single load.
+define amdgpu_kernel void @test3(i32 addrspace(1)* %out) {
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %d1 = getelementptr inbounds i8, i8 addrspace(4)* %dispatch_ptr, i64 4
+  %h1 = bitcast i8 addrspace(4)* %d1 to i16 addrspace(4)*
+  %v1 = load i16, i16 addrspace(4)* %h1
+  %e1 = zext i16 %v1 to i32
+  %d2 = getelementptr inbounds i8, i8 addrspace(4)* %dispatch_ptr, i64 6
+  %h2 = bitcast i8 addrspace(4)* %d2 to i16 addrspace(4)*
+  %v2 = load i16, i16 addrspace(4)* %h2
+  %e2 = zext i16 %v2 to i32
+  %o = add i32 %e1, %e2
+  store i32 %o, i32 addrspace(1)* %out
+  ret void
+}
+
 declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0

 attributes #0 = { readnone }
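For reference, a minimal sketch (not part of the patch) of the IR rewrite performed by visitLoadInst, assuming a uniform i16 load from the constant address space whose base pointer is known to be DWORD aligned and whose constant byte offset is 6, so that Offset - Adjust = 4 and ShAmt = Adjust * 8 = 16. The value names (%base, %gep, etc.) are hypothetical.

; Before: sub-DWORD, 2-byte aligned scalar load.
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %base, i64 6
  %ptr = bitcast i8 addrspace(4)* %gep to i16 addrspace(4)*
  %val = load i16, i16 addrspace(4)* %ptr, align 2

; After: DWORD-aligned load at the rounded-down offset, followed by a logical
; shift right by 16 bits and a truncation back to the original type.
  %gep4 = getelementptr i8, i8 addrspace(4)* %base, i64 4
  %ptr4 = bitcast i8 addrspace(4)* %gep4 to i32 addrspace(4)*
  %wide = load i32, i32 addrspace(4)* %ptr4, align 4
  %shft = lshr i32 %wide, 16
  %val.new = trunc i32 %shft to i16

This corresponds to what test2 above checks at the ISA level: the half-word load is selected as an s_load_dword followed by an s_lshr_b32 instead of a buffer_load_ushort.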