Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7294,9 +7294,33 @@
         continue;
       }
 
+      bool NoCallIntrinsic = false;
+      if (isa<IntrinsicInst>(&*PrevInstIt)) {
+        auto *II = dyn_cast<IntrinsicInst>(&*PrevInstIt);
+        CallInst *CI = cast<CallInst>(II);
+        SmallVector<Type *, 4> Tys;
+        for (auto &ArgOp : CI->args())
+          Tys.push_back(ArgOp->getType());
+
+        if (II->isAssumeLikeIntrinsic())
+          NoCallIntrinsic = true;
+        else {
+          FastMathFlags FMF;
+          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
+            FMF = FPMO->getFastMathFlags();
+          IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
+                                      FMF);
+          InstructionCost IntrCost =
+              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
+          InstructionCost CallCost = TTI->getCallInstrCost(
+              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
+          if (IntrCost < CallCost)
+            NoCallIntrinsic = true;
+        }
+      }
+
       // Debug information does not impact spill cost.
-      if ((isa<CallInst>(&*PrevInstIt) &&
-           !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+      if ((isa<CallInst>(&*PrevInstIt) && !NoCallIntrinsic) &&
           &*PrevInstIt != PrevInst)
         NumCalls++;
 
Index: llvm/test/Transforms/SLPVectorizer/AArch64/fmulladd.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SLPVectorizer/AArch64/fmulladd.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 -S | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-unknown"
+
+@b = common global i32 0, align 4
+@a = common global ptr null, align 8
+
+define void @foo(ptr nocapture noundef readonly %d, ptr nocapture noundef readonly %e) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x double], ptr [[D:%.*]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x double], ptr [[D]], i64 0, i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x double], ptr [[E:%.*]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [4 x double], ptr [[D]], i64 1, i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX15]], align 8
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [4 x double], ptr [[D]], i64 3, i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX17]], align 8
+; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [4 x double], ptr [[E]], i64 2, i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX19]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP2]], double [[TMP3]], double [[TMP1]])
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [4 x double], ptr [[D]], i64 3, i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr [[ARRAYIDX21]], align 8
+; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [4 x double], ptr [[E]], i64 3, i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr [[ARRAYIDX23]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP6]], double [[TMP4]])
+; CHECK-NEXT:    [[TMP8:%.*]] = load ptr, ptr @a, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[SHUFFLE]], <2 x double> [[TMP10]], <2 x double> [[TMP9]])
+; CHECK-NEXT:    store <2 x double> [[TMP12]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[F_SROA_539_0__SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 40
+; CHECK-NEXT:    store double [[TMP7]], ptr [[F_SROA_539_0__SROA_IDX]], align 8
+; CHECK-NEXT:    store i32 6, ptr @b, align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx = getelementptr inbounds [4 x double], ptr %d, i64 2
+  %0 = load double, ptr %arrayidx, align 8
+  %arrayidx3 = getelementptr inbounds [4 x double], ptr %d, i64 0, i64 3
+  %1 = load double, ptr %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds [4 x double], ptr %e, i64 3
+  %2 = load double, ptr %arrayidx4, align 8
+  %3 = tail call double @llvm.fmuladd.f64(double %1, double %2, double %0)
+  %arrayidx8 = getelementptr inbounds [4 x double], ptr %d, i64 2, i64 1
+  %4 = load double, ptr %arrayidx8, align 8
+  %arrayidx12 = getelementptr inbounds [4 x double], ptr %e, i64 3, i64 1
+  %5 = load double, ptr %arrayidx12, align 8
+  %6 = tail call double @llvm.fmuladd.f64(double %1, double %5, double %4)
+  %arrayidx15 = getelementptr inbounds [4 x double], ptr %d, i64 1, i64 3
+  %7 = load double, ptr %arrayidx15, align 8
+  %arrayidx17 = getelementptr inbounds [4 x double], ptr %d, i64 3, i64 2
+  %8 = load double, ptr %arrayidx17, align 8
+  %arrayidx19 = getelementptr inbounds [4 x double], ptr %e, i64 2, i64 3
+  %9 = load double, ptr %arrayidx19, align 8
+  %10 = tail call double @llvm.fmuladd.f64(double %8, double %9, double %7)
+  %arrayidx21 = getelementptr inbounds [4 x double], ptr %d, i64 3, i64 3
+  %11 = load double, ptr %arrayidx21, align 8
+  %arrayidx23 = getelementptr inbounds [4 x double], ptr %e, i64 3, i64 3
+  %12 = load double, ptr %arrayidx23, align 8
+  %13 = tail call double @llvm.fmuladd.f64(double %11, double %12, double %10)
+  %14 = load ptr, ptr @a, align 8
+  store double %3, ptr %14, align 8
+  %f.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %14, i64 8
+  store double %6, ptr %f.sroa.4.0..sroa_idx, align 8
+  %f.sroa.539.0..sroa_idx = getelementptr inbounds i8, ptr %14, i64 40
+  store double %13, ptr %f.sroa.539.0..sroa_idx, align 8
+  store i32 6, ptr @b, align 4
+  ret void
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
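
Note on the approach: the heart of the SLPVectorizer.cpp change is the TTI cost
comparison, under which an intrinsic only counts as a call for spill-cost
purposes when the target would actually lower it as a call. The sketch below
restates that logic as a standalone helper; the free-function form, the name
isNoCallIntrinsic, and the by-reference TTI parameter are illustrative
assumptions, not part of the patch itself.

  // Sketch only: the patch inlines this logic in BoUpSLP::getSpillCost.
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/IR/Operator.h"

  using namespace llvm;

  static bool isNoCallIntrinsic(const Instruction *I,
                                const TargetTransformInfo &TTI) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false; // Plain calls are never exempt from spill cost.
    // Assume-like intrinsics (llvm.assume, lifetime markers, ...) emit no
    // machine code at all.
    if (II->isAssumeLikeIntrinsic())
      return true;
    // Ask TTI whether expanding the intrinsic inline is cheaper than issuing
    // a real call; if so, no call-clobbered registers are spilled across it.
    SmallVector<Type *, 4> Tys;
    for (const Use &ArgOp : II->args())
      Tys.push_back(ArgOp->getType());
    FastMathFlags FMF;
    if (const auto *FPMO = dyn_cast<FPMathOperator>(II))
      FMF = FPMO->getFastMathFlags();
    IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys, FMF);
    InstructionCost IntrCost = TTI.getIntrinsicInstrCost(
        ICA, TargetTransformInfo::TCK_RecipThroughput);
    InstructionCost CallCost = TTI.getCallInstrCost(
        nullptr, II->getType(), Tys, TargetTransformInfo::TCK_RecipThroughput);
    return IntrCost < CallCost;
  }

On AArch64, llvm.fmuladd.f64 lowers to a single fmadd, so its intrinsic cost
falls below its call cost and the fmuladd calls in the new test no longer
inflate NumCalls; this is presumably what previously made getSpillCost reject
the <2 x double> vectorization the test now checks for.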