diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -47,6 +47,8 @@ UMin, ///< Unsigned integer min implemented in terms of select(cmp()). UMax, ///< Unsigned integer max implemented in terms of select(cmp()). FAdd, ///< Sum of floats. + FAddAsInt, ///< Sum of floats that can be performed on integers, followed by + ///< clamping to the largest exactly representable integer. FMul, ///< Product of floats. FMin, ///< FP min implemented in terms of select(cmp()). FMax, ///< FP max implemented in terms of select(cmp()). diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -231,6 +231,15 @@ // preheader. Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); + if (Kind == RecurKind::FAddAsInt) { + if (!RdxStart->getType()->isFloatTy() && !RdxStart->getType()->isDoubleTy()) + return false; + + auto *C = dyn_cast<ConstantFP>(RdxStart); + if (!C || !C->getValue().isInteger()) + return false; + } + // ExitInstruction is the single value which is used outside the loop. // We only allow for a single reduction value to be used outside the loop. // This includes users of the reduction, variables (which form a cycle @@ -504,6 +513,10 @@ // The ExitInstruction(Instruction which is allowed to have out-of-loop users) // is saved as part of the RecurrenceDescriptor. + if (Kind == RecurKind::FAddAsInt) { + RecurrenceType = IntegerType::get(RecurrenceType->getContext(), + RecurrenceType->getScalarSizeInBits()); + } // Save the description of this reduction variable. RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ReduxDesc.getExactFPMathInst(), RecurrenceType, @@ -657,6 +670,9 @@ I1->isFast()) return InstDesc(Kind == RecurKind::FAdd, SI); + if (m_FAdd(m_Value(Op1), m_Value(Op2)).match(I1)) + return InstDesc(Kind == RecurKind::FAddAsInt, SI); + if (m_FMul(m_Value(Op1), m_Value(Op2)).match(I1) && (I1->isFast())) return InstDesc(Kind == RecurKind::FMul, SI); @@ -690,10 +706,27 @@ I->hasAllowReassoc() ? nullptr : I); case Instruction::FSub: case Instruction::FAdd: + if (Kind == RecurKind::FAddAsInt) { + // FAdd-to-int conversion only supports a single fadd in the chain at the + // moment. + if (Prev.isRecurrence()) + return InstDesc(false, nullptr); + if (auto *C = dyn_cast<ConstantFP>(I->getOperand(1))) { + auto F = C->getValue(); + if (F == APFloat(F.getSemantics(), 1) || + F == APFloat(F.getSemantics(), 2) || + F == APFloat(F.getSemantics(), 4) || + F == APFloat(F.getSemantics(), 8) || + F == APFloat(F.getSemantics(), 16) || + F == APFloat(F.getSemantics(), 32)) + return InstDesc(true, I, I); + } + } return InstDesc(Kind == RecurKind::FAdd, I, I->hasAllowReassoc() ? nullptr : I); case Instruction::Select: - if (Kind == RecurKind::FAdd || Kind == RecurKind::FMul) + if (Kind == RecurKind::FAdd || Kind == RecurKind::FAddAsInt || + Kind == RecurKind::FMul) return isConditionalRdxPattern(Kind, I); LLVM_FALLTHROUGH; case Instruction::FCmp: @@ -783,6 +816,11 @@ LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::FAddAsInt, TheLoop, FMF, RedDes, DB, AC, + DT)) { + LLVM_DEBUG(dbgs() << "Found an FAddAsInt reduction PHI." << *Phi << "\n"); + return true; + } if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) { LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } @@ -912,6 +950,7 @@ switch (K) { case RecurKind::Xor: case RecurKind::Add: + case RecurKind::FAddAsInt: case RecurKind::Or: // Adding, Xoring, Oring zero to a number does not change it. return ConstantInt::get(Tp, 0); @@ -960,6 +999,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { switch (Kind) { case RecurKind::Add: + case RecurKind::FAddAsInt: return Instruction::Add; case RecurKind::Mul: return Instruction::Mul; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1041,6 +1041,7 @@ auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType(); switch (RdxKind) { case RecurKind::Add: + case RecurKind::FAddAsInt: return Builder.CreateAddReduce(Src); case RecurKind::Mul: return Builder.CreateMulReduce(Src); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -913,20 +913,27 @@ // If the above is false, we have ExactFPMath & do not allow reordering. // If the EnableStrictReductions flag is set, first check if we have any // Exact FP induction vars, which we cannot vectorize. - if (!EnableStrictReductions || - any_of(getInductionVars(), [&](auto &Induction) -> bool { + if (any_of(getInductionVars(), [&](auto &Induction) -> bool { InductionDescriptor IndDesc = Induction.second; return IndDesc.getExactFPMathInst(); })) return false; + if (all_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return !RdxDesc.hasExactFPMath() || + RdxDesc.getRecurrenceKind() == RecurKind::FAddAsInt; + })) + return true; + // We can now only vectorize if all reductions with Exact FP math also // have the isOrdered flag set, which indicates that we can move the // reduction operations in-loop. - return (all_of(getReductionVars(), [&](auto &Reduction) -> bool { - const RecurrenceDescriptor &RdxDesc = Reduction.second; - return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered(); - })); + return EnableStrictReductions && + all_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered(); + }); } bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
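The detection logic above (an integral float/double start value, and an increment restricted to the powers of two 1 through 32) rests on one IEEE-754 fact: once a float accumulator reaches 2^24 (2^53 for double), adding 1.0 rounds back to the same value under round-to-nearest-even, so the exact scalar loop saturates at precisely the bound the patch later clamps to, and for an increment of C from the allowed set every intermediate sum (a multiple of C, with a zero start) stays exactly representable up to 2^24 * C. A minimal standalone C++ sketch of that fact, not part of the patch and assuming default round-to-nearest-even:

    #include <cassert>

    int main() {
      // 2^24 is the last point at which consecutive integers are exactly
      // representable as floats; one step further, adding 1.0f rounds back
      // to the same value, so the scalar float accumulator saturates here.
      float f24 = 16777216.0f;          // 0x1.0p24f
      float f24n = f24 + 1.0f;
      assert(f24n == f24);

      // For an increment of 2 the saturation point doubles to 2^25, which
      // is what the middle-block clamp Max = GetMaxVal(Ty) * C computes.
      float f25 = 33554432.0f;          // 0x1.0p25f
      float f25n = f25 + 2.0f;
      assert(f25n == f25);

      // The analogous double bound is 2^53.
      double d53 = 9007199254740992.0;  // 0x1.0p53
      double d53n = d53 + 1.0;
      assert(d53n == d53);
      return 0;
    }

This is also why the legality change can treat FAddAsInt reductions as vectorizable even with exact FP math: the integer sum plus the clamp reproduces the exact sequential float result.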
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4186,6 +4186,16 @@ } } +static VPRecipeBase *getReductionIncrementRecipe(VPReductionPHIRecipe *PhiR) { + SmallVector<VPUser *> Users(PhiR->users()); + assert((Users.size() == 1 || Users.size() == 2) && + "reduction phi must have either 1 or 2 users"); + auto *R = cast<VPRecipeBase>(Users[0]); + if (isa<VPInstruction>(R)) + return cast<VPRecipeBase>(Users[1]); + return R; +} + void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An @@ -4424,12 +4434,35 @@ if (VF.isVector() && !PhiR->isInLoop()) { ReducedPartRdx = createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); - // If the reduction can be performed in a smaller type, we need to extend - // the reduction to the wider type before we branch to the original loop. - if (PhiTy != RdxDesc.getRecurrenceType()) - ReducedPartRdx = RdxDesc.isSigned() - ? Builder.CreateSExt(ReducedPartRdx, PhiTy) - : Builder.CreateZExt(ReducedPartRdx, PhiTy); + if (RK == RecurKind::FAddAsInt) { + // Convert the integer result back to a floating point number and clamp + // it to the correct maximum value. + VPRecipeBase *R = getReductionIncrementRecipe(PhiR); + // Return the largest integral value the accumulator can reach exactly + // when counting in steps of 1.0: 2^24 for float, 2^53 for double, + // depending on the bitwidth. + auto GetMaxVal = [](Type *T) -> uint64_t { + // Reduction on float type. + if (T->getScalarSizeInBits() == 32) + return 0x1.0p24; + + assert(T->getScalarSizeInBits() == 64); + return 0x1.0p53; + }; + auto *C = R->getOperand(1)->getLiveInIRValue(); + uint64_t Max = + GetMaxVal(C->getType()) * cast<ConstantInt>(C)->getSExtValue(); + ReducedPartRdx = Builder.CreateSIToFP(ReducedPartRdx, PhiTy); + ReducedPartRdx = + Builder.CreateMinNum(ReducedPartRdx, ConstantFP::get(PhiTy, Max)); + } else { + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (PhiTy != RdxDesc.getRecurrenceType()) + ReducedPartRdx = RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, PhiTy) + : Builder.CreateZExt(ReducedPartRdx, PhiTy); + } } // Create a phi node that merges control-flow from the backedge-taken check @@ -9706,6 +9739,44 @@ Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); } } + + SmallVector<VPRecipeBase *> ToRemove; + for (VPRecipeBase &P : Plan->getEntry()->getEntryBasicBlock()->phis()) { + auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(&P); + if (!RedPhi || RedPhi->getRecurrenceDescriptor().getRecurrenceKind() != + RecurKind::FAddAsInt) + continue; + // Convert the floating point reduction increment to an integer addition + // using a VPInstruction. + VPRecipeBase *R = getReductionIncrementRecipe(RedPhi); + SmallVector<VPValue *> Ops(R->op_begin(), R->op_end()); + Value *In = Ops[1]->getLiveInIRValue(); + auto GetInt = [](Value *V) { + auto *C = cast<ConstantFP>(V); + auto F = C->getValue(); + if (F == APFloat(F.getSemantics(), 1)) + return 1; + if (F == APFloat(F.getSemantics(), 2)) + return 2; + if (F == APFloat(F.getSemantics(), 4)) + return 4; + if (F == APFloat(F.getSemantics(), 8)) + return 8; + if (F == APFloat(F.getSemantics(), 16)) + return 16; + if (F == APFloat(F.getSemantics(), 32)) + return 32; + llvm_unreachable("unexpected float constant"); + }; + Ops[1] = Plan->getOrAddVPValue(ConstantInt::get( + RedPhi->getRecurrenceDescriptor().getRecurrenceType(), GetInt(In))); + auto *IntAdd = new VPInstruction(Instruction::Add, Ops); + R->getVPSingleValue()->replaceAllUsesWith(IntAdd); + IntAdd->insertAfter(R); + ToRemove.push_back(R); + } + for (VPRecipeBase *R : ToRemove) + R->eraseFromParent(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
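Taken together, the two LoopVectorize.cpp changes leave the vector loop doing plain integer adds and defer all floating point work to the middle block. For a float reduction with increment 2 at VF=4/UF=2, the emitted epilogue has the following shape (value names are illustrative; the exact form is pinned down by the test expectations below):

    %bin.rdx = add <4 x i32> %part1, %part0
    %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
    %fp = sitofp i32 %sum to float
    ; clamp at GetMaxVal(float) * 2 = 2^25, printed as 0x4180000000000000
    %res = call float @llvm.minnum.f32(float %fp, float 0x4180000000000000)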
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1323,8 +1323,10 @@ // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. bool ScalarPHI = State.VF.isScalar() || IsInLoop; - Type *VecTy = - ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); + RecurKind RK = RdxDesc.getRecurrenceKind(); + Type *BaseTy = + RK == RecurKind::FAddAsInt ? RdxDesc.getRecurrenceType() : PN->getType(); + Type *VecTy = ScalarPHI ? BaseTy : VectorType::get(BaseTy, State.VF); BasicBlock *HeaderBB = State.CFG.PrevBB; assert(State.LI->getLoopFor(HeaderBB)->getHeader() == HeaderBB && @@ -1341,8 +1343,13 @@ VPValue *StartVPV = getStartValue(); Value *StartV = StartVPV->getLiveInIRValue(); + if (RK == RecurKind::FAddAsInt) { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); + StartV = Builder.CreateFPToSI(StartV, RdxDesc.getRecurrenceType()); + } + Value *Iden = nullptr; - RecurKind RK = RdxDesc.getRecurrenceKind(); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { // MinMax reductions have the start value as their identity.
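The widening change above also has to move the reduction's start value into the integer domain: an fptosi is emitted in the vector preheader, which constant-folds for the literal starts the descriptor accepts. With VF=4 and UF=2, only part 0 of the interleaved phi carries the converted start in lane 0; the other part starts from the plain Add identity, as the start_negative test below expects (lane layout taken from those expectations, not a new guarantee):

    %vec.phi  = phi <4 x i32> [ <i32 -3, i32 0, i32 0, i32 0>, %vector.ph ], ...
    %vec.phi1 = phi <4 x i32> [ zeroinitializer, %vector.ph ], ...

Note that the clamp bound needs no adjustment for a negative integral start: every fadd below the saturation point is still exact, so the integer and float sums agree until minnum takes over.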
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fadd-reduction-as-int.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fadd-reduction-as-int.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/fadd-reduction-as-int.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fadd-reduction-as-int.ll @@ -5,28 +5,33 @@ define float @test_fadd_to_int_add_1(i64 %cnt) { ; CHECK-LABEL: @test_fadd_to_int_add_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[CNT]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = fadd float [[VEC_PHI]], 1.000000e+00 -; CHECK-NEXT: [[TMP1]] = fadd float [[TMP0]], 1.000000e+00 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI1]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP5]] to float +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.minnum.f32(float [[TMP6]], float 0x4170000000000000) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -36,7 +41,7 @@ ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -54,6 +59,120 @@ ret float %red.next } +define float @test_fadd_to_int_add_1_iv_only_nuw(i64 %cnt) { +; CHECK-LABEL: @test_fadd_to_int_add_1_iv_only_nuw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[CNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI1]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP5]] to float +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.minnum.f32(float [[TMP6]], float 0x4170000000000000) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], 1.000000e+00 +;
CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi float [ 0.000000e+00, %entry ], [ %red.next, %loop ] + %red.next = fadd float %red, 1.000000e+00 + %iv.next = add nuw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %cnt + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret float %red.next +} + +define float @test_fadd_to_int_add_1_iv_only_nsw(i64 %cnt) { +; CHECK-LABEL: @test_fadd_to_int_add_1_iv_only_nsw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[CNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI1]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP5]] to float +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.minnum.f32(float [[TMP6]], float 0x4170000000000000) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], 1.000000e+00 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi
float [ 0.000000e+00, %entry ], [ %red.next, %loop ] + %red.next = fadd float %red, 1.000000e+00 + %iv.next = add nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %cnt + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret float %red.next +} + define float @test_fadd_to_int_add_1_5(i64 %cnt) { ; CHECK-LABEL: @test_fadd_to_int_add_1_5( ; CHECK-NEXT: entry: @@ -72,7 +191,7 @@ ; CHECK-NEXT: [[TMP1]] = fadd float [[TMP0]], 1.500000e+00 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -86,7 +205,7 @@ ; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], 1.500000e+00 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -109,28 +228,33 @@ define float @test_fadd_to_int_add_1_start_negative(i64 %cnt) { ; CHECK-LABEL: @test_fadd_to_int_add_1_start_negative( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[CNT]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ -3.000000e+00, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = fadd float [[VEC_PHI]], 1.000000e+00 -; CHECK-NEXT: [[TMP1]] = fadd float [[TMP0]], 1.000000e+00 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -3, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI1]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT:
[[BIN_RDX:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP5]] to float +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.minnum.f32(float [[TMP6]], float 0x4170000000000000) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -3.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -3.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -138,9 +262,9 @@ ; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], 1.000000e+00 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -176,7 +300,7 @@ ; CHECK-NEXT: [[TMP1]] = fadd float [[TMP0]], 1.000000e+00 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -190,7 +314,7 @@ ; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], 1.000000e+00 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] @@ -228,7 +352,7 @@ ; CHECK-NEXT: [[TMP1]] = fadd float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -242,7 +366,7 @@ ; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], -1.000000e+00 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 
[[CNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -265,28 +389,33 @@ define double @test_fadd_to_int_add_1_double(i64 %cnt) { ; CHECK-LABEL: @test_fadd_to_int_add_1_double( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[CNT]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = fadd double [[VEC_PHI]], 1.000000e+00 -; CHECK-NEXT: [[TMP1]] = fadd double [[TMP0]], 1.000000e+00 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], <i64 1, i64 1> +; CHECK-NEXT: [[TMP3]] = add <2 x i64> [[VEC_PHI1]], <i64 1, i64 1> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP6:%.*]] = sitofp i64 [[TMP5]] to double +; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.minnum.f64(double [[TMP6]], double 0x4340000000000000) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -294,9 +423,9 @@ ; CHECK-NEXT: [[RED_NEXT]] = fadd double [[RED]], 1.000000e+00 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] -;
CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret double [[RED_NEXT_LCSSA]] ; entry: @@ -317,28 +446,33 @@ define float @test_fadd_to_int_add_2(i64 %cnt) { ; CHECK-LABEL: @test_fadd_to_int_add_2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[CNT]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = fadd float [[VEC_PHI]], 2.000000e+00 -; CHECK-NEXT: [[TMP1]] = fadd float [[TMP0]], 2.000000e+00 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], <i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI1]], <i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP5]] to float +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.minnum.f32(float [[TMP6]], float 0x4180000000000000) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -346,9 +480,9 @@ ; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], 2.000000e+00 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw
i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -384,7 +518,7 @@ ; CHECK-NEXT: [[TMP1]] = fadd float [[TMP0]], 0.000000e+00 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -398,7 +532,7 @@ ; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], 0.000000e+00 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -421,20 +555,59 @@ define float @test_fadd_to_int_with_select_add_1(i32* nocapture readonly %A, i64 %cnt) { ; CHECK-LABEL: @test_fadd_to_int_with_select_add_1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[CNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD2]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[VEC_PHI1]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP16:%.*]] = sitofp i32 [[TMP15]] to float +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float 0x4170000000000000) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] ; CHECK-NEXT: [[LV_A:%.*]] = load i32, i32* [[GEP_A]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[LV_A]], 0 ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[RED]], 1.000000e+00 ; CHECK-NEXT: [[RED_NEXT]] = select i1 [[CMP1]], float [[ADD]], float [[RED]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -459,20 +632,59 @@ define double @test_fadd_to_int_with_select_add_1_double(i32* nocapture readonly %A, i64 %cnt) { ; CHECK-LABEL: @test_fadd_to_int_with_select_add_1_double( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[CNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer,
[[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD2]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[VEC_PHI]], <i64 1, i64 1> +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[VEC_PHI1]], <i64 1, i64 1> +; CHECK-NEXT: [[TMP12]] = select <2 x i1> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[TMP13]] = select <2 x i1> [[TMP9]], <2 x i64> [[TMP11]], <2 x i64> [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP16:%.*]] = sitofp i64 [[TMP15]] to double +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP16]], double 0x4340000000000000) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] ; CHECK-NEXT: [[LV_A:%.*]] = load i32, i32* [[GEP_A]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[LV_A]], 0 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[RED]], 1.000000e+00 ; CHECK-NEXT: [[RED_NEXT]] = select i1 [[CMP1]], double [[ADD]], double [[RED]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ] +;
CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret double [[RED_NEXT_LCSSA]] ; entry: @@ -497,20 +709,59 @@ define float @test_fadd_to_int_with_select_add_2(i32* nocapture readonly %A, i64 %cnt) { ; CHECK-LABEL: @test_fadd_to_int_with_select_add_2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CNT:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CNT]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[CNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD2]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[VEC_PHI1]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP16:%.*]] = sitofp i32 [[TMP15]] to float +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float 0x4170000000000000) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [
[[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] ; CHECK-NEXT: [[LV_A:%.*]] = load i32, i32* [[GEP_A]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[LV_A]], 0 ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[RED]], 1.000000e+00 ; CHECK-NEXT: [[RED_NEXT]] = select i1 [[CMP1]], float [[ADD]], float [[RED]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[CNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: