diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8329,6 +8329,8 @@
     return nullptr;
 
   auto willWiden = [&](ElementCount VF) -> bool {
+    if (VF.isScalar())
+      return false;
     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
     // The following case may be scalarized depending on the VF.
    // The flag shows whether we use Intrinsic or a usual Call for vectorized
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -1139,10 +1139,10 @@
 ; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float*
 ; CHECK-UNORDERED: [[LOAD6:%.*]] = load float, float*
 ; CHECK-UNORDERED: [[LOAD7:%.*]] = load float, float*
-; CHECK-UNORDERED: [[FMULADD]] = call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]])
-; CHECK-UNORDERED: [[FMULADD1]] = call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]])
-; CHECK-UNORDERED: [[FMULADD2]] = call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]])
-; CHECK-UNORDERED: [[FMULADD3]] = call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]])
+; CHECK-UNORDERED: [[FMULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = tail call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = tail call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block:
 ; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-with-call.ll b/llvm/test/Transforms/LoopVectorize/interleave-with-call.ll
--- a/llvm/test/Transforms/LoopVectorize/interleave-with-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-with-call.ll
@@ -12,7 +12,7 @@
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
 ; CHECK-NEXT:   vp<[[IV_STEPS:%.]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<%start>, ir<1>
-; CHECK-NEXT:   WIDEN-CALL ir<%min> = call @llvm.smin.i32(vp<[[IV_STEPS]]>, ir<65535>)
+; CHECK-NEXT:   CLONE ir<%min> = call vp<[[IV_STEPS]]>, ir<65535>, ir<@llvm.smin.i32>
 ; CHECK-NEXT:   CLONE ir<%arrayidx> = getelementptr ir<%dst>, vp<[[IV_STEPS]]>
 ; CHECK-NEXT:   CLONE store ir<%min>, ir<%arrayidx>
 ; CHECK-NEXT:   EMIT vp<[[INC:%.+]]> = VF * UF +(nuw) vp<[[CAN_IV]]>
@@ -27,8 +27,8 @@
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 %start, [[INDEX]]
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[INDUCTION1:%.*]] = add i32 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smin.i32(i32 [[INDUCTION]], i32 65535)
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.smin.i32(i32 [[INDUCTION1]], i32 65535)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.smin.i32(i32 [[INDUCTION]], i32 65535)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[INDUCTION1]], i32 65535)
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDUCTION]]
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDUCTION1]]
 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 8
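
Note (illustration, not part of the patch): with a scalar VF there is only one lane, so there is nothing to widen; returning false from willWiden makes the planner fall back to scalarizing (cloning) the call. That is why the interleave-with-call.ll expectations switch from a WIDEN-CALL recipe to a CLONE recipe, and why the per-lane calls in the IR checks now keep the original call's `tail` marker: the scalar call is cloned rather than rebuilt. Below is a minimal, self-contained sketch of the guard's logic. ElementCountModel and the HasVectorVariant flag are hypothetical stand-ins for LLVM's llvm::ElementCount (llvm/Support/TypeSize.h) and the intrinsic/vector-library checks the real lambda performs; only the VF.isScalar() early-out mirrors the patch itself.

    // Hypothetical, simplified model of the guard added to willWiden.
    // A "scalar" VF has exactly one fixed lane, so a widened (vector)
    // call cannot exist and the recipe must clone the scalar call.
    #include <cassert>

    struct ElementCountModel {   // stand-in for llvm::ElementCount
      unsigned MinLanes;         // known-minimum number of lanes
      bool Scalable;             // true for scalable vectors (e.g. SVE)
      bool isScalar() const { return !Scalable && MinLanes == 1; }
    };

    // Mirrors the shape of the patched lambda: never report a call as
    // widenable at VF=1; for larger VFs defer to whatever vector-variant
    // checks the real code performs (folded into one flag here).
    static bool willWiden(ElementCountModel VF, bool HasVectorVariant) {
      if (VF.isScalar())
        return false;            // the early-out this patch adds
      return HasVectorVariant;
    }

    int main() {
      assert(!willWiden({1, false}, true));  // VF=1: always scalarize
      assert(willWiden({4, false}, true));   // VF=4, vector variant exists
      assert(!willWiden({4, false}, false)); // VF=4, none: scalarize
    }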