diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -68,6 +68,7 @@
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
+FunctionPass *createAMDGPULateCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
@@ -223,6 +224,9 @@
 void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
 extern char &AMDGPUCodeGenPrepareID;
 
+void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
+extern char &AMDGPULateCodeGenPrepareID;
+
 void initializeSIAnnotateControlFlowPass(PassRegistry&);
 extern char &SIAnnotateControlFlowPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -0,0 +1,198 @@
+//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass does miscellaneous AMDGPU optimizations on IR *just* before
+/// instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include
+#include
+
+#define DEBUG_TYPE "amdgpu-late-codegenprepare"
+
+using namespace llvm;
+
+// Scalar load widening needs to run after the load-store vectorizer, as that
+// pass does not handle overlapping cases. In addition, this pass enhances the
+// widening to handle cases where scalar sub-dword loads are only naturally
+// aligned but not dword aligned.
+static cl::opt<bool>
+    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
+               cl::desc("Widen sub-dword constant address space loads in "
+                        "AMDGPULateCodeGenPrepare"),
+               cl::ReallyHidden, cl::init(true));
+
+namespace {
+
+class AMDGPULateCodeGenPrepare
+    : public FunctionPass,
+      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
+  Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
+
+  AssumptionCache *AC = nullptr;
+  LegacyDivergenceAnalysis *DA = nullptr;
+
+public:
+  static char ID;
+
+  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "AMDGPU IR late optimizations";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
+    AU.setPreservesAll();
+  }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  bool visitInstruction(Instruction &) { return false; }
+
+  // Check if the specified value is at least DWORD aligned.
+  bool isDWORDAligned(const Value *V) const {
+    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
+    return Known.countMinTrailingZeros() >= 2;
+  }
+
+  bool canWidenScalarExtLoad(LoadInst &LI) const;
+  bool visitLoadInst(LoadInst &LI);
+};
+
+} // end anonymous namespace
+
+bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
+  Mod = &M;
+  DL = &Mod->getDataLayout();
+  return false;
+}
+
+bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+  bool Changed = false;
+  for (auto &BB : F)
+    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
+      Instruction *I = &*BI++;
+      Changed |= visit(*I);
+    }
+
+  return Changed;
+}
+
+bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
+  unsigned AS = LI.getPointerAddressSpace();
+  // Skip non-constant address spaces.
+  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
+      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+    return false;
+  // Skip non-simple loads.
+  if (!LI.isSimple())
+    return false;
+  auto *Ty = LI.getType();
+  // Skip aggregate types.
+  if (Ty->isAggregateType())
+    return false;
+  unsigned TySize = DL->getTypeStoreSize(Ty);
+  // Only handle sub-DWORD loads.
+  if (TySize >= 4)
+    return false;
+  // The load must be at least naturally aligned.
+  if (LI.getAlign() < DL->getABITypeAlign(Ty))
+    return false;
+  // It should be uniform, i.e. a scalar load.
+  return DA->isUniform(&LI);
+}
+
+bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
+  if (!WidenLoads)
+    return false;
+
+  // Skip if the load is already at least DWORD aligned, as that case is
+  // handled in SDAG.
+  if (LI.getAlign() >= 4)
+    return false;
+
+  if (!canWidenScalarExtLoad(LI))
+    return false;
+
+  int64_t Offset = 0;
+  auto *Base =
+      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
+  // If the base is not DWORD aligned, it's not safe to perform the following
+  // transforms.
+  if (!isDWORDAligned(Base))
+    return false;
+
+  int64_t Adjust = Offset & 0x3;
+  if (Adjust == 0) {
+    // With a zero adjust, the original alignment can simply be promoted to
+    // DWORD alignment.
+    LI.setAlignment(Align(4));
+    return true;
+  }
+
+  IRBuilder<> IRB(&LI);
+  IRB.SetCurrentDebugLocation(LI.getDebugLoc());
+
+  unsigned AS = LI.getPointerAddressSpace();
+  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
+  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
+
+  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
+  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
+  auto *NewPtr = IRB.CreateBitCast(
+      IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
+                             Offset - Adjust),
+      Int32PtrTy);
+  LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
+  NewLd->copyMetadata(LI);
+  NewLd->setMetadata(LLVMContext::MD_range, nullptr);
+
+  unsigned ShAmt = Adjust * 8;
+  auto *NewVal = IRB.CreateBitCast(
+      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
+  LI.replaceAllUsesWith(NewVal);
+  RecursivelyDeleteTriviallyDeadInstructions(&LI);
+
+  return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+                      "AMDGPU IR late optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+                    "AMDGPU IR late optimizations", false, false)
+
+char AMDGPULateCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
+  return new AMDGPULateCodeGenPrepare();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -236,6 +236,7 @@
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
+  initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -865,6 +866,7 @@ bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
+  addPass(createAMDGPULateCodeGenPreparePass());
   if (EnableAtomicOptimizations) {
     addPass(createAMDGPUAtomicOptimizerPass());
   }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -58,6 +58,7 @@
   AMDGPUISelDAGToDAG.cpp
   AMDGPUISelLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPULateCodeGenPrepare.cpp
   AMDGPULegalizerInfo.cpp
   AMDGPULibCalls.cpp
   AMDGPULibFunc.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -14,6 +14,22 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test2
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_load_dword s[[REG:[0-9]+]], s[4:5], 0x1
+; GCN: s_lshr_b32 s{{[0-9]+}}, s[[REG]], 16
+; GCN-NOT: load_ushort
+; GCN: s_endpgm
+define amdgpu_kernel void @test2(i32 addrspace(1)* %out) {
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %d1 = getelementptr inbounds i8, i8 addrspace(4)* %dispatch_ptr, i64 6
+  %h1 = bitcast i8 addrspace(4)* %d1 to i16 addrspace(4)*
+  %v1 = load i16, i16 addrspace(4)* %h1
+  %e1 = zext i16 %v1 to i32
+  store i32 %e1, i32 addrspace(1)* %out
+  ret void
+}
+
 declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
 
 attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
--- a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
@@ -22,6 +22,37 @@
   ret void
 }
 
+; A slightly more complicated case where more sub-dword loads can be coalesced
+; if they are not widened earlier.
+; GCN-LABEL: {{^}}load_4i16:
+; GCN: s_load_dwordx2 s{{\[}}[[D0:[0-9]+]]:[[D1:[0-9]+]]{{\]}}, s[4:5], 0x4
+; GCN-NOT: s_load_dword {{s[0-9]+}}, s[4:5], 0x4
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D0]], 16
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D1]], 16
+; GCN: s_endpgm
+define protected amdgpu_kernel void @load_4i16(i32 addrspace(1)* %out) {
+entry:
+  %disp = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %gep_x = getelementptr i8, i8 addrspace(4)* %disp, i64 4
+  %gep_x.cast = bitcast i8 addrspace(4)* %gep_x to i16 addrspace(4)*
+  %id_x = load i16, i16 addrspace(4)* %gep_x.cast, align 4, !invariant.load !0 ; load workgroup size x
+  %gep_y = getelementptr i8, i8 addrspace(4)* %disp, i64 6
+  %gep_y.cast = bitcast i8 addrspace(4)* %gep_y to i16 addrspace(4)*
+  %id_y = load i16, i16 addrspace(4)* %gep_y.cast, align 2, !invariant.load !0 ; load workgroup size y
+  %gep_z = getelementptr i8, i8 addrspace(4)* %disp, i64 8
+  %gep_z.cast = bitcast i8 addrspace(4)* %gep_z to i16 addrspace(4)*
+  %id_z = load i16, i16 addrspace(4)* %gep_z.cast, align 4, !invariant.load !0 ; load workgroup size z
+  %gep_w = getelementptr i8, i8 addrspace(4)* %disp, i64 10
+  %gep_w.cast = bitcast i8 addrspace(4)* %gep_w to i16 addrspace(4)*
+  %id_w = load i16, i16 addrspace(4)* %gep_w.cast, align 2, !invariant.load !0 ; load reserved field at offset 10
+  %add = add nuw nsw i16 %id_y, %id_x
+  %add2 = add nuw nsw i16 %id_z, %id_w
+  %add3 = add nuw nsw i16 %add, %add2
+  %conv = zext i16 %add3 to i32
+  store i32 %conv, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 
 !0 = !{!0}
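
For reference, a minimal before/after sketch (not part of the patch) of the IR rewrite performed by visitLoadInst, assuming a hypothetical DWORD-aligned constant-address-space base pointer %base and a naturally aligned i16 load at byte offset 6, i.e. Offset = 6 and Adjust = 2:

; Before: a uniform sub-dword load that is only 2-byte aligned.
  %gep = getelementptr i8, i8 addrspace(4)* %base, i64 6
  %ptr = bitcast i8 addrspace(4)* %gep to i16 addrspace(4)*
  %val = load i16, i16 addrspace(4)* %ptr, align 2

; After: a DWORD-aligned i32 load at offset 4 (Offset - Adjust), followed by a
; logical shift right by Adjust * 8 = 16 bits and a truncate to the original type.
  %wide.gep = getelementptr i8, i8 addrspace(4)* %base, i64 4
  %wide.ptr = bitcast i8 addrspace(4)* %wide.gep to i32 addrspace(4)*
  %wide = load i32, i32 addrspace(4)* %wide.ptr, align 4
  %shift = lshr i32 %wide, 16
  %val.new = trunc i32 %shift to i16

The widened load copies the original load's metadata except !range, which is dropped because it no longer describes the wider value.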