Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -53,7 +53,8 @@ FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsPass(); FunctionPass *createSIFormMemoryClausesPass(); -FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &); +FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &, + const TargetMachine *); FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); Index: lib/Target/AMDGPU/AMDGPULibCalls.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPULibFunc.h" +#include "AMDGPUSubtarget.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/ADT/StringSet.h" @@ -22,6 +23,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -29,6 +31,7 @@ #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include <vector> #include <cmath> @@ -65,6 +68,8 @@ typedef llvm::AMDGPULibFunc FuncInfo; + const TargetMachine *TM; + // -fuse-native. bool AllNative = false; @@ -134,6 +139,9 @@ // __read_pipe/__write_pipe bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo); + // llvm.amdgcn.wavefrontsize + bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B); + // Get insertion point at entry. BasicBlock::iterator getEntryIns(CallInst * UI); // Insert an Alloc instruction. 
@@ -152,6 +160,8 @@ } public: + AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {} + bool fold(CallInst *CI, AliasAnalysis *AA = nullptr); void initNativeFuncs(); @@ -166,15 +176,16 @@ class AMDGPUSimplifyLibCalls : public FunctionPass { - AMDGPULibCalls Simplifier; - const TargetOptions Options; + AMDGPULibCalls Simplifier; + public: static char ID; // Pass identification - AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions()) - : FunctionPass(ID), Options(Opt) { + AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(), + const TargetMachine *TM = nullptr) + : FunctionPass(ID), Options(Opt), Simplifier(TM) { initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } @@ -639,14 +650,6 @@ // Ignore indirect calls. if (Callee == 0) return false; - FuncInfo FInfo; - if (!parseFunctionName(Callee->getName(), &FInfo)) - return false; - - // Further check the number of arguments to see if they match. - if (CI->getNumArgOperands() != FInfo.getNumArgs()) - return false; - BasicBlock *BB = CI->getParent(); LLVMContext &Context = CI->getParent()->getContext(); IRBuilder<> B(Context); @@ -658,6 +661,21 @@ if (const FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) B.setFastMathFlags(FPOp->getFastMathFlags()); + switch (Callee->getIntrinsicID()) { + default: + break; + case Intrinsic::amdgcn_wavefrontsize: + return !EnablePreLink && fold_wavefrontsize(CI, B); + } + + FuncInfo FInfo; + if (!parseFunctionName(Callee->getName(), &FInfo)) + return false; + + // Further check the number of arguments to see if they match. 
+ if (CI->getNumArgOperands() != FInfo.getNumArgs()) + return false; + if (TDOFold(CI, FInfo)) return true; @@ -1371,6 +1389,29 @@ return true; } +bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) { + if (!TM) + return false; + + StringRef CPU = TM->getTargetCPU(); + StringRef Features = TM->getTargetFeatureString(); + if ((CPU.empty() || CPU.equals_lower("generic")) && + (Features.empty() || + Features.find_lower("wavefrontsize") == StringRef::npos)) + return false; + + Function *F = CI->getParent()->getParent(); + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F); + unsigned N = ST.getWavefrontSize(); + + LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with " + << N << "\n"); + + CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N)); + CI->eraseFromParent(); + return true; +} + // Get insertion point at entry. BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) { Function * Func = UI->getParent()->getParent(); @@ -1680,8 +1721,9 @@ } // Public interface to the Simplify LibCalls pass. 
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) { - return new AMDGPUSimplifyLibCalls(Opt); +FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt, + const TargetMachine *TM) { + return new AMDGPUSimplifyLibCalls(Opt, TM); } FunctionPass *llvm::createAMDGPUUseNativeCallsPass() { Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -420,15 +420,15 @@ const auto &Opt = Options; Builder.addExtension( PassManagerBuilder::EP_EarlyAsPossible, - [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &, - legacy::PassManagerBase &PM) { + [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); } PM.add(llvm::createAMDGPUUseNativeCallsPass()); if (LibCallSimplify) - PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt)); + PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this)); }); Builder.addExtension( Index: test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -0,0 +1,84 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s + +; RUN: opt -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s +; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize32 -S < %s | 
FileCheck -check-prefixes=OPT,OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s + +; GCN-LABEL: {{^}}fold_wavefrontsize: +; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( + +; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32 +; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64 +; GCN: store_dword v[{{[0-9:]+}}], [[V]] + +; OPT-W32: store i32 32, i32 addrspace(1)* %arg, align 4 +; OPT-W64: store i32 64, i32 addrspace(1)* %arg, align 4 +; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() +; OPT-WXX: store i32 %tmp, i32 addrspace(1)* %arg, align 4 +; OPT-NEXT: ret void + +define amdgpu_kernel void @fold_wavefrontsize(i32 addrspace(1)* nocapture %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + store i32 %tmp, i32 addrspace(1)* %arg, align 4 + ret void +} + +; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize: +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( + +; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}} +; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}} +; GCN-NOT: cndmask +; GCN: store_dword v[{{[0-9:]+}}], [[V]] + +; OPT-W32: store i32 1, i32 addrspace(1)* %arg, align 4 +; OPT-W64: store i32 2, i32 addrspace(1)* %arg, align 4 +; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() +; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32 +; OPT-WXX: %tmp2 = select i1 %tmp1, i32 2, i32 1 +; OPT-WXX: store i32 %tmp2, i32 addrspace(1)* %arg +; OPT-NEXT: ret void + +define amdgpu_kernel void @fold_and_optimize_wavefrontsize(i32 addrspace(1)* nocapture %arg) { +bb: + %tmp = tail call i32 
@llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + %tmp2 = select i1 %tmp1, i32 2, i32 1 + store i32 %tmp2, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize: +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( + +; OPT: bb: +; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() +; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32 +; OPT-WXX: bb3: +; OPT-W64: store i32 1, i32 addrspace(1)* %arg, align 4 +; OPT-NEXT: ret void + +define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(i32 addrspace(1)* nocapture %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + br i1 %tmp1, label %bb2, label %bb3 + +bb2: ; preds = %bb + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %bb3 + +bb3: ; preds = %bb2, %bb + ret void +} + +declare i32 @llvm.amdgcn.wavefrontsize() #0 + +attributes #0 = { nounwind readnone speculatable }