Index: lib/Transforms/InstCombine/InstCombineAddSub.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -182,6 +182,7 @@
     Value *createFAdd(Value *Opnd0, Value *Opnd1);
     Value *createFMul(Value *Opnd0, Value *Opnd1);
     Value *createFDiv(Value *Opnd0, Value *Opnd1);
+    Value *createFMA(Value *Opnd0, Value *Opnd1, Value *Opnd2, bool IsFMulAdd);
     Value *createFNeg(Value *V);
     Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
     void createInstPostProc(Instruction *NewInst, bool NoNumber = false);
@@ -426,9 +427,37 @@
           I->getOpcode() == Instruction::FSub) && "Expect add/sub");
 
   Instruction *I0 = dyn_cast<Instruction>(I->getOperand(0));
-  Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1));
+  if (!I0)
+    return nullptr;
+
+  if (IntrinsicInst *II0 = dyn_cast<IntrinsicInst>(I0)) {
+    // (fadd (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, z))
+    //
+    // This could be done for fma, but whether that is faster is target
+    // dependent.
+    if (I->getOpcode() == Instruction::FAdd &&
+        II0->hasUnsafeAlgebra() &&
+        II0->getIntrinsicID() == Intrinsic::fmuladd &&
+        II0->hasOneUse()) {
+      Value *U, *V;
+      Value *FMA2 = II0->getArgOperand(2);
+      if (!FMA2->hasOneUse() ||
+          !match(FMA2, m_FMul(m_Value(U), m_Value(V))) ||
+          !cast<Instruction>(FMA2)->hasUnsafeAlgebra())
+        return nullptr;
+
+      Value *X = II0->getArgOperand(0);
+      Value *Y = II0->getArgOperand(1);
+      Value *Z = I->getOperand(1);
+      Value *FMAUVZ = createFMA(U, V, Z, true);
+      return createFMA(X, Y, FMAUVZ, true);
+    }
 
-  if (!I0 || !I1 || I0->getOpcode() != I1->getOpcode())
+    return nullptr;
+  }
+
+  Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1));
+  if (!I1 || I0->getOpcode() != I1->getOpcode())
     return nullptr;
 
   bool isMpy = false;
@@ -769,6 +798,18 @@
   return V;
 }
 
+Value *FAddCombine::createFMA(Value *Opnd0, Value *Opnd1, Value *Opnd2,
+                              bool FMulAdd) {
+  Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+  Intrinsic::ID Opc = FMulAdd ? Intrinsic::fmuladd : Intrinsic::fma;
+  Value *F = Intrinsic::getDeclaration(M, Opc, Opnd0->getType());
+
+  Value *V = Builder->CreateCall(F, { Opnd0, Opnd1, Opnd2 });
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    createInstPostProc(I);
+  return V;
+}
+
 void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) {
   NewInstr->setDebugLoc(Instr->getDebugLoc());
Index: test/Transforms/InstCombine/fmuladd-opt.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/fmuladd-opt.ll
@@ -0,0 +1,160 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+; (fadd (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, z))
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float %v, float %z)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %x, float %y, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_sub_fmuladd_fmul(
+; CHECK: %mul.u.v = fmul fast float %u, %v
+; CHECK-NEXT: %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fsub fast float %fma, %z
+; CHECK-NEXT: ret float %add
+define float @fast_sub_fmuladd_fmul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fsub fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_multi_use_mul(
+; CHECK: fmul fast
+; CHECK: call fast float @llvm.fmuladd.f32(
+; CHECK: fadd fast
+define float @fast_add_fmuladd_fmul_multi_use_mul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  store volatile float %mul.u.v, float* undef
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_multi_use_fmuladd(
+; CHECK: fmul fast
+; CHECK: call fast float @llvm.fmuladd.f32(
+; CHECK: fadd fast
+define float @fast_add_fmuladd_fmul_multi_use_fmuladd(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  store volatile float %fma, float* undef
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_constant_x(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float %v, float %z)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %y, float 8.000000e+00, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul_constant_x(float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float 8.0, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_constant_y(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float %v, float %z)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %x, float 4.000000e+00, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul_constant_y(float %x, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float 4.0, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_constant_v(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float 4.000000e+00, float %z)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %x, float %y, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul_constant_v(float %x, float %y, float %z, float %u) {
+  %mul.u.v = fmul fast float %u, 4.0
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_constant_z(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float %v, float 4.000000e+00)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %x, float %y, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul_constant_z(float %x, float %y, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, 4.0
+  ret float %add
+}
+
+; CHECK-LABEL: @missing_fast_add_fmuladd_fmul_0(
+; CHECK: fmul float %u, %v
+; CHECK-NEXT: %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd fast float %fma, %z
+; CHECK-NEXT: ret float %add
+define float @missing_fast_add_fmuladd_fmul_0(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @missing_fast_add_fmuladd_fmul_1(
+; CHECK: %mul.u.v = fmul fast float %u, %v
+; CHECK-NEXT: %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd fast float %fma, %z
+; CHECK-NEXT: ret float %add
+define float @missing_fast_add_fmuladd_fmul_1(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @missing_fast_add_fmuladd_fmul_2(
+; CHECK: %mul.u.v = fmul fast float %u, %v
+; CHECK-NEXT: %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd float %fma, %z
+; CHECK-NEXT: ret float %add
+define float @missing_fast_add_fmuladd_fmul_2(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @safe_add_fmuladd_fmul(
+; CHECK: %mul.u.v = fmul float %u, %v
+; CHECK-NEXT: %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd float %fma, %z
define float @safe_add_fmuladd_fmul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul float %u, %v
+  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd float %fma, %z
+  ret float %add
+}
+
+; This is not done because it depends on the target whether it is
+; profitable.
+
+; CHECK-LABEL: @fast_add_fma_fmul(
+; CHECK-NEXT: %mul.u.v = fmul fast float %u, %v
+; CHECK-NEXT: %fmuladd = call fast float @llvm.fma.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd fast float %fmuladd, %z
+; CHECK-NEXT: ret float %add
+define float @fast_add_fma_fmul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fmuladd = call fast float @llvm.fma.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fmuladd, %z
+  ret float %add
+}
+
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+attributes #0 = { nounwind readnone }
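
A before/after sketch of the new combine, for reference only (this is the same
pattern @fast_add_fmuladd_fmul exercises; the %inner/%outer names are
illustrative, not what instcombine emits):

  ; Before: ((x * y) + (u * v)) + z, with every operation flagged fast.
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z

  ; After: (x * y) + ((u * v) + z), expressed as two nested fmuladd calls.
  %inner = call fast float @llvm.fmuladd.f32(float %u, float %v, float %z)
  %outer = call fast float @llvm.fmuladd.f32(float %x, float %y, float %inner)

Because this reassociates floating-point addition, the combine only fires when
the fmul, the fmuladd, and the fadd all carry unsafe-algebra flags, and only
when the fmul and the fmuladd have no other uses; the missing_fast_* and
multi_use_* tests above cover each of those bail-out conditions.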