Index: llvm/lib/Target/AMDGPU/AMDGPU.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.h +++ llvm/lib/Target/AMDGPU/AMDGPU.h @@ -91,6 +91,10 @@ void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); extern char &AMDGPULowerIntrinsicsID; +FunctionPass *createAMDGPULowerIntrinsicsFPass(); +void initializeAMDGPULowerIntrinsicsFPass(PassRegistry &); +extern char &AMDGPULowerIntrinsicsFID; + ModulePass *createAMDGPUCtorDtorLoweringPass(); void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &); extern char &AMDGPUCtorDtorLoweringID; Index: llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -34,6 +34,47 @@ cl::init(1024), cl::Hidden); +// TODO: Should refine based on estimated number of accesses (e.g. does it +// require splitting based on alignment) +static bool shouldExpandOperationWithSize(Value *Size) { + ConstantInt *CI = dyn_cast<ConstantInt>(Size); + return !CI || (CI->getSExtValue() > MaxStaticSize); +} + +static bool expandMemIntrinsic(IntrinsicInst *Inst, + const TargetTransformInfo &TTI) { + switch (Inst->getIntrinsicID()) { + case Intrinsic::memcpy: { + auto *Memcpy = cast<MemCpyInst>(Inst); + if (shouldExpandOperationWithSize(Memcpy->getLength())) { + expandMemCpyAsLoop(Memcpy, TTI); + return true; + } + + return false; + } + case Intrinsic::memmove: { + auto *Memmove = cast<MemMoveInst>(Inst); + if (shouldExpandOperationWithSize(Memmove->getLength())) { + expandMemMoveAsLoop(Memmove); + return true; + } + + return false; + } + case Intrinsic::memset: { + auto *Memset = cast<MemSetInst>(Inst); + if (shouldExpandOperationWithSize(Memset->getLength())) { + expandMemSetAsLoop(Memset); + return true; + } + + return false; + } + default: + return false; + } +} class AMDGPULowerIntrinsics : public ModulePass { private: @@ -55,65 +96,49 @@ } }; +class AMDGPULowerIntrinsicsF : public 
FunctionPass { +private: + bool makeLIDRangeMetadata(Function &F) const; + +public: + static char ID; + + AMDGPULowerIntrinsicsF() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + bool expandMemIntrinsicUses(Function &F); + StringRef getPassName() const override { + return "AMDGPU Lower Intrinsics (function)"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetTransformInfoWrapperPass>(); + } +}; } +char AMDGPULowerIntrinsicsF::ID = 0; char AMDGPULowerIntrinsics::ID = 0; char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID; INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false, false) - -// TODO: Should refine based on estimated number of accesses (e.g. does it -// require splitting based on alignment) -static bool shouldExpandOperationWithSize(Value *Size) { - ConstantInt *CI = dyn_cast<ConstantInt>(Size); - return !CI || (CI->getSExtValue() > MaxStaticSize); -} +INITIALIZE_PASS(AMDGPULowerIntrinsicsF, DEBUG_TYPE "-func", "Lower intrinsics", + false, false) bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) { - Intrinsic::ID ID = F.getIntrinsicID(); bool Changed = false; for (User *U : llvm::make_early_inc_range(F.users())) { - Instruction *Inst = cast<Instruction>(U); - - switch (ID) { - case Intrinsic::memcpy: { - auto *Memcpy = cast<MemCpyInst>(Inst); - if (shouldExpandOperationWithSize(Memcpy->getLength())) { - Function *ParentFunc = Memcpy->getParent()->getParent(); - const TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc); - expandMemCpyAsLoop(Memcpy, TTI); - Changed = true; - Memcpy->eraseFromParent(); - } - - break; - } - case Intrinsic::memmove: { - auto *Memmove = cast<MemMoveInst>(Inst); - if (shouldExpandOperationWithSize(Memmove->getLength())) { - expandMemMoveAsLoop(Memmove); - Changed = true; - Memmove->eraseFromParent(); - } - - break; - } - case Intrinsic::memset: { - auto *Memset = cast<MemSetInst>(Inst); - if (shouldExpandOperationWithSize(Memset->getLength())) { - expandMemSetAsLoop(Memset); - Changed = true; - 
Memset->eraseFromParent(); - } - - break; - } - default: - break; + IntrinsicInst *Inst = cast<IntrinsicInst>(U); + Function *ParentFunc = Inst->getParent()->getParent(); + + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc); + if (expandMemIntrinsic(Inst, TTI)) { + Inst->eraseFromParent(); + Changed = true; } } @@ -140,6 +165,26 @@ return Changed; } +bool AMDGPULowerIntrinsicsF::makeLIDRangeMetadata(Function &F) const { + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + const TargetMachine &TM = TPC->getTM<TargetMachine>(); + bool Changed = false; + + for (auto *U : F.users()) { + auto *CI = dyn_cast<CallInst>(U); + if (!CI) + continue; + + Function *Caller = CI->getParent()->getParent(); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *Caller); + Changed |= ST.makeLIDRangeMetadata(CI); + } + return Changed; +} + bool AMDGPULowerIntrinsics::runOnModule(Module &M) { bool Changed = false; @@ -172,6 +217,58 @@ return Changed; } +bool AMDGPULowerIntrinsicsF::runOnFunction(Function &F) { + bool Changed = false; + + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + const TargetMachine &TM = TPC->getTM<TargetMachine>(); + + for (Function::iterator I = F.begin(), E = F.end(); I != E;) { + BasicBlock *BB = &*I++; + for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE;) { + IntrinsicInst *Intrin = dyn_cast<IntrinsicInst>(&*J++); + if (!Intrin) + continue; + + switch (Intrin->getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + if (expandMemIntrinsic(Intrin, TTI)) { + BB = Intrin->getParent(); + JE = BB->end(); + Intrin->eraseFromParent(); + Changed = true; + } + + break; + case Intrinsic::r600_read_tidig_x: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::r600_read_tidig_z: + case Intrinsic::r600_read_local_size_x: + case Intrinsic::r600_read_local_size_y: + case Intrinsic::r600_read_local_size_z: { + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); + Changed |= 
ST.makeLIDRangeMetadata(Intrin); + break; + } + default: + break; + } + } + } + + return Changed; +} + ModulePass *llvm::createAMDGPULowerIntrinsicsPass() { return new AMDGPULowerIntrinsics(); } + +FunctionPass *llvm::createAMDGPULowerIntrinsicsFPass() { + return new AMDGPULowerIntrinsicsF(); +} Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -367,6 +367,7 @@ initializeAMDGPUPromoteKernelArgumentsPass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); + initializeAMDGPULowerIntrinsicsFPass(*PR); initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); @@ -1030,7 +1031,7 @@ // A call to propagate attributes pass in the backend in case opt was not run. addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); - addPass(createAMDGPULowerIntrinsicsPass()); + addPass(createAMDGPULowerIntrinsicsFPass()); // Function calls are not supported, so make sure we inline everything. 
addPass(createAMDGPUAlwaysInlinePass()); Index: llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics-func -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s declare void @llvm.memcpy.p1i8.p1i8.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1 declare void @llvm.memcpy.p1i8.p3i8.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1