Index: llvm/include/llvm/CodeGen/CodeGenPassBuilder.h =================================================================== --- llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -579,7 +579,7 @@ if (TM.useEmulatedTLS()) addPass(LowerEmuTLSPass()); - addPass(PreISelIntrinsicLoweringPass()); + addPass(PreISelIntrinsicLoweringPass(TM)); derived().addIRPasses(addPass); derived().addCodeGenPrepare(addPass); Index: llvm/include/llvm/CodeGen/PreISelIntrinsicLowering.h =================================================================== --- llvm/include/llvm/CodeGen/PreISelIntrinsicLowering.h +++ llvm/include/llvm/CodeGen/PreISelIntrinsicLowering.h @@ -18,9 +18,13 @@ namespace llvm { class Module; +class TargetMachine; struct PreISelIntrinsicLoweringPass : PassInfoMixin { + const TargetMachine &TM; + + PreISelIntrinsicLoweringPass(const TargetMachine &TM) : TM(TM) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; Index: llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp =================================================================== --- llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -14,9 +14,10 @@ #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/Analysis/ObjCARCUtil.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -26,6 +27,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" using namespace llvm; @@ -41,19 +43,19 @@ namespace { struct PreISelIntrinsicLowering { + const TargetMachine &TM; const 
function_ref LookupTTI; - const function_ref LookupLibInfo; /// If this is true, assume it's preferably to leave memory intrinsic calls /// for replacement with a library call later. Otherwise this depends on - /// TargetLibraryInfo availability of the corresponding function. + /// TargetLoweringInfo availability of the corresponding function. const bool UseMemIntrinsicLibFunc; explicit PreISelIntrinsicLowering( + const TargetMachine &TM_, function_ref LookupTTI_, - function_ref LookupLibInfo_, bool UseMemIntrinsicLibFunc_ = true) - : LookupTTI(LookupTTI_), LookupLibInfo(LookupLibInfo_), + : TM(TM_), LookupTTI(LookupTTI_), UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {} static bool shouldExpandMemIntrinsicWithSize(Value *Size, @@ -195,9 +197,15 @@ return SizeVal > Threshold || Threshold == 0; } +static bool canEmitLibcall(const TargetLowering &TLI, RTLIB::Libcall LC) { + // TODO: Should this consider the address space of the memcpy? + return TLI.getLibcallName(LC) != nullptr; +} + // TODO: Handle atomic memcpy and memcpy.inline // TODO: Pass ScalarEvolution bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const { + const TargetLowering *TLI = TM.getSubtargetImpl(F)->getTargetLowering(); Intrinsic::ID ID = F.getIntrinsicID(); bool Changed = false; @@ -210,10 +218,10 @@ Function *ParentFunc = Memcpy->getFunction(); const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) { - if (UseMemIntrinsicLibFunc && - LookupLibInfo(*ParentFunc).has(LibFunc_memcpy)) + if (UseMemIntrinsicLibFunc && canEmitLibcall(*TLI, RTLIB::MEMCPY)) break; + // TODO: For optsize, emit the loop into a separate function expandMemCpyAsLoop(Memcpy, TTI); Changed = true; Memcpy->eraseFromParent(); @@ -226,8 +234,7 @@ Function *ParentFunc = Memmove->getFunction(); const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) { - if (UseMemIntrinsicLibFunc && 
- LookupLibInfo(*ParentFunc).has(LibFunc_memmove)) + if (UseMemIntrinsicLibFunc && canEmitLibcall(*TLI, RTLIB::MEMMOVE)) break; if (expandMemMoveAsLoop(Memmove, TTI)) { @@ -243,8 +250,7 @@ Function *ParentFunc = Memset->getFunction(); const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) { - if (UseMemIntrinsicLibFunc && - LookupLibInfo(*Memset->getFunction()).has(LibFunc_memset)) + if (UseMemIntrinsicLibFunc && canEmitLibcall(*TLI, RTLIB::MEMSET)) break; expandMemSetAsLoop(Memset); @@ -365,8 +371,8 @@ PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); AU.addRequired(); + AU.addRequired(); } bool runOnModule(Module &M) override { @@ -374,11 +380,8 @@ return this->getAnalysis().getTTI(F); }; - auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { - return this->getAnalysis().getTLI(F); - }; - - PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI); + const auto &TM = getAnalysis().getTM(); + PreISelIntrinsicLowering Lowering(TM, LookupTTI); return Lowering.lowerIntrinsics(M); } }; @@ -387,27 +390,28 @@ char PreISelIntrinsicLoweringLegacyPass::ID; -INITIALIZE_PASS(PreISelIntrinsicLoweringLegacyPass, - "pre-isel-intrinsic-lowering", "Pre-ISel Intrinsic Lowering", - false, false) +INITIALIZE_PASS_BEGIN(PreISelIntrinsicLoweringLegacyPass, + "pre-isel-intrinsic-lowering", + "Pre-ISel Intrinsic Lowering", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(PreISelIntrinsicLoweringLegacyPass, + "pre-isel-intrinsic-lowering", + "Pre-ISel Intrinsic Lowering", false, false) ModulePass *llvm::createPreISelIntrinsicLoweringPass() { - return new PreISelIntrinsicLoweringLegacyPass; + return new PreISelIntrinsicLoweringLegacyPass(); } PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM 
= AM.getResult(M).getManager(); - auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & { - return FAM.getResult(F); - }; - auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & { return FAM.getResult(F); }; - PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI); + PreISelIntrinsicLowering Lowering(TM, LookupTTI); if (!Lowering.lowerIntrinsics(M)) return PreservedAnalyses::all(); else Index: llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -71,9 +71,6 @@ // rootn bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); - // fma/mad - bool fold_fma_mad(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - // -fuse-native for sincos bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo); @@ -581,11 +578,6 @@ case AMDGPULibFunc::EI_COS: case AMDGPULibFunc::EI_SIN: return fold_sincos(FPOp, B, FInfo); - case AMDGPULibFunc::EI_FMA: - case AMDGPULibFunc::EI_MAD: - case AMDGPULibFunc::EI_NFMA: - // skip vector function - return (getVecSize(FInfo) != 1) ? 
false : fold_fma_mad(CI, B, FInfo); default: break; } @@ -1016,50 +1008,6 @@ return false; } -bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B, - const FuncInfo &FInfo) { - Value *opr0 = CI->getArgOperand(0); - Value *opr1 = CI->getArgOperand(1); - Value *opr2 = CI->getArgOperand(2); - - ConstantFP *CF0 = dyn_cast(opr0); - ConstantFP *CF1 = dyn_cast(opr1); - if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) { - // fma/mad(a, b, c) = c if a=0 || b=0 - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n"); - replaceCall(CI, opr2); - return true; - } - if (CF0 && CF0->isExactlyValue(1.0f)) { - // fma/mad(a, b, c) = b+c if a=1 - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2 - << "\n"); - Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd"); - replaceCall(CI, nval); - return true; - } - if (CF1 && CF1->isExactlyValue(1.0f)) { - // fma/mad(a, b, c) = a+c if b=1 - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2 - << "\n"); - Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd"); - replaceCall(CI, nval); - return true; - } - if (ConstantFP *CF = dyn_cast(opr2)) { - if (CF->isZero()) { - // fma/mad(a, b, c) = a*b if c=0 - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " - << *opr1 << "\n"); - Value *nval = B.CreateFMul(opr0, opr1, "fmamul"); - replaceCall(CI, nval); - return true; - } - } - - return false; -} - // Get a scalar native builtin single argument FP function FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M, const FuncInfo &FInfo) { Index: llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -471,7 +471,7 @@ } ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x -; GCN: store float %y, ptr addrspace(1) %a +; GCN: %call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float 
%y) define amdgpu_kernel void @test_fma_0x(ptr addrspace(1) nocapture %a, float %y) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -483,7 +483,7 @@ declare float @_Z3fmafff(float, float, float) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0 -; GCN: store float %y, ptr addrspace(1) %a +; GCN: %call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y) define amdgpu_kernel void @test_fma_x0(ptr addrspace(1) nocapture %a, float %y) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -493,7 +493,7 @@ } ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x -; GCN: store float %y, ptr addrspace(1) %a +; GCN: %call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y) define amdgpu_kernel void @test_mad_0x(ptr addrspace(1) nocapture %a, float %y) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -505,7 +505,7 @@ declare float @_Z3madfff(float, float, float) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0 -; GCN: store float %y, ptr addrspace(1) %a +; GCN: %call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y) define amdgpu_kernel void @test_mad_x0(ptr addrspace(1) nocapture %a, float %y) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -515,7 +515,7 @@ } ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y -; GCN: %fmaadd = fadd fast float %tmp, %y +; GCN: %call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y) define amdgpu_kernel void @test_fma_x1y(ptr addrspace(1) nocapture %a, float %y) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -525,7 +525,7 @@ } ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy -; GCN: %fmaadd = fadd fast float %tmp, %y +; GCN: %call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y) define amdgpu_kernel void @test_fma_1xy(ptr addrspace(1) nocapture %a, float %y) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -535,7 +535,7 
@@ } ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0 -; GCN: %fmamul = fmul fast float %tmp1, %tmp +; GCN: %call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00) define amdgpu_kernel void @test_fma_xy0(ptr addrspace(1) nocapture %a) { entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 Index: llvm/test/CodeGen/ARM/no-expand-memcpy-no-builtins.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/ARM/no-expand-memcpy-no-builtins.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=thumbv7em-apple-unknown-macho < %s | FileCheck %s + +target datalayout = "e-m:o-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + +declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #0 +declare void @llvm.memmove.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #0 +declare void @llvm.memset.p0.i32(ptr noalias nocapture writeonly, i8, i32, i1 immarg) #0 + +; Check we don't expand memcpy to a loop, even when the caller +; has no-builtins attached.
+ +; CHECK: bl _memcpy +define arm_aapcs_vfpcc void @test_memcpy(ptr %p1, ptr %p2) #1 { + call void @llvm.memcpy.p0.p0.i32(ptr %p1, ptr %p2, i32 128, i1 false) + ret void +} + +; CHECK: bl _memmove +define arm_aapcs_vfpcc void @test_memmove(ptr %p1, ptr %p2) #1 { + call void @llvm.memmove.p0.p0.i32(ptr %p1, ptr %p2, i32 128, i1 false) + ret void +} + +; CHECK: bl _memset +define arm_aapcs_vfpcc void @test_memset(ptr %p1) #1 { + call void @llvm.memset.p0.i32(ptr %p1, i8 0, i32 128, i1 false) + ret void +} + +attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #1 = { "no-builtins" } Index: llvm/test/Transforms/PreISelIntrinsicLowering/X86/lit.local.cfg =================================================================== --- /dev/null +++ llvm/test/Transforms/PreISelIntrinsicLowering/X86/lit.local.cfg @@ -0,0 +1,2 @@ +if not "X86" in config.root.targets: + config.unsupported = True Index: llvm/test/Transforms/PreISelIntrinsicLowering/X86/load-relative.ll =================================================================== --- llvm/test/Transforms/PreISelIntrinsicLowering/X86/load-relative.ll +++ llvm/test/Transforms/PreISelIntrinsicLowering/X86/load-relative.ll @@ -1,4 +1,4 @@ -; RUN: opt -pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-pc-linux-gnu -pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s ; CHECK: define ptr @foo32(ptr [[P:%.*]], i32 [[O:%.*]]) define ptr @foo32(ptr %p, i32 %o) { Index: llvm/test/Transforms/PreISelIntrinsicLowering/X86/objc-arc.ll =================================================================== --- llvm/test/Transforms/PreISelIntrinsicLowering/X86/objc-arc.ll +++ llvm/test/Transforms/PreISelIntrinsicLowering/X86/objc-arc.ll @@ -1,4 +1,4 @@ -; RUN: opt -pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-pc-linux-gnu -pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s ; Make sure calls to the objc intrinsics are translated to 
calls in to the ; runtime