diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8754,11 +8754,21 @@ // When avoiding a runtime check, the active.lane.mask inside the loop // uses a modified trip count and the induction variable increment is // done after the active.lane.mask intrinsic is called. - auto *TCMinusVF = - new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); - VecPreheader->appendRecipe(TCMinusVF); + + auto *Sub = new VPInstruction(Instruction::Sub, + {TC, &Plan.getRuntimeVFxUF()}, DL); + auto *Cmp = new VPInstruction(VPInstruction::ICmpUGT, + {TC, &Plan.getRuntimeVFxUF()}, DL); + auto *Select = new VPInstruction( + Instruction::Select, + {Cmp, Sub, Plan.getVPValueOrAddLiveIn(ConstantInt::get(IdxTy, 0))}, + DL); + + VecPreheader->appendRecipe(Sub); + VecPreheader->appendRecipe(Cmp); + VecPreheader->appendRecipe(Select); IncrementValue = CanonicalIVPHI; - TripCount = TCMinusVF; + TripCount = Select; } else { // When the loop is guarded by a runtime overflow check for the loop // induction variable increment by VF, we can increment the value before diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -994,10 +994,10 @@ // values of a first-order recurrence. Not, ICmpULE, + ICmpUGT, SLPLoad, SLPStore, ActiveLaneMask, - CalculateTripCountMinusVF, CanonicalIVIncrement, // The next op is similar to the above, but instead increment the // canonical IV separately for each unrolled part. @@ -1108,7 +1108,6 @@ default: return false; case VPInstruction::ActiveLaneMask: - case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::BranchOnCount: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -214,25 +214,25 @@ } Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { - if (Def->isLiveIn()) - return Def->getLiveInIRValue(); - if (hasScalarValue(Def, Instance)) { return Data .PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)]; } - assert(hasVectorValue(Def, Instance.Part)); - auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; - if (!VecPart->getType()->isVectorTy()) { - assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar"); - return VecPart; + if (hasVectorValue(Def, Instance.Part)) { + auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; + if (!VecPart->getType()->isVectorTy()) { + assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar"); + return VecPart; + } + // TODO: Cache created scalar values. + Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF); + auto *Extract = Builder.CreateExtractElement(VecPart, Lane); + // set(Def, Extract, Instance); + return Extract; } - // TODO: Cache created scalar values. - Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF); - auto *Extract = Builder.CreateExtractElement(VecPart, Lane); - // set(Def, Extract, Instance); - return Extract; + + return Def->getLiveInIRValue(); } Value *VPTransformState::get(VPValue *Def, unsigned Part) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -246,8 +246,10 @@ Builder.SetCurrentDebugLocation(DL); if (Instruction::isBinaryOp(getOpcode())) { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); + Value *A = getParent()->getParent() ? State.get(getOperand(0), Part) + : State.get(getOperand(0), {0, 0}); + Value *B = getParent()->getParent() ? State.get(getOperand(1), Part) + : State.get(getOperand(1), {0, 0}); return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); } @@ -261,10 +263,21 @@ Value *TC = State.get(getOperand(1), Part); return Builder.CreateICmpULE(IV, TC, Name); } + case VPInstruction::ICmpUGT: { + Value *A = getParent()->getParent() ? State.get(getOperand(0), Part) + : State.get(getOperand(0), {0, 0}); + Value *B = getParent()->getParent() ? State.get(getOperand(1), Part) + : State.get(getOperand(1), {0, 0}); + + return Builder.CreateICmpUGT(A, B, Name); + } case Instruction::Select: { - Value *Cond = State.get(getOperand(0), Part); - Value *Op1 = State.get(getOperand(1), Part); - Value *Op2 = State.get(getOperand(2), Part); + Value *Cond = getParent()->getParent() ? State.get(getOperand(0), Part) + : State.get(getOperand(0), {0, 0}); + Value *Op1 = getParent()->getParent() ? State.get(getOperand(1), Part) + : State.get(getOperand(1), {0, 0}); + Value *Op2 = getParent()->getParent() ? State.get(getOperand(2), Part) + : State.get(getOperand(2), {0, 0}); return Builder.CreateSelect(Cond, Op1, Op2, Name); } case VPInstruction::ActiveLaneMask: { @@ -300,15 +313,6 @@ Value *V2 = State.get(getOperand(1), Part); return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name); } - case VPInstruction::CalculateTripCountMinusVF: { - Value *ScalarTC = State.get(getOperand(0), {0, 0}); - Value *Step = - createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); - Value *Sub = Builder.CreateSub(ScalarTC, Step); - Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); - Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); - return Builder.CreateSelect(Cmp, Sub, Zero); - } case VPInstruction::CanonicalIVIncrement: { if (Part == 0) { auto *Phi = State.get(getOperand(0), 0); @@ -425,6 +429,9 @@ case VPInstruction::ICmpULE: O << "icmp ule"; break; + case VPInstruction::ICmpUGT: + O << "icmp ugt"; + break; case VPInstruction::SLPLoad: O << "combined load"; break; @@ -443,9 +450,6 @@ case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; - case VPInstruction::CalculateTripCountMinusVF: - O << "TC > VF ? TC - VF : 0"; - break; case VPInstruction::CanonicalIVIncrementForPart: O << "VF * Part +"; break; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -138,45 +138,43 @@ ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP16]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP15]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP13]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP19]], [[SUM_07]] +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP17]], [[SUM_07]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] ; @@ -407,26 +405,18 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 24 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP14]] -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[N]], [[TMP14]] -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = sub i64 [[N]], [[TMP19]] -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[N]], [[TMP19]] -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i64 [[TMP20]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = sub i64 [[N]], [[TMP24]] -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = icmp ugt i64 [[N]], [[TMP24]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[TMP25]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = sub i64 [[N]], [[TMP29]] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = icmp ugt i64 [[N]], [[TMP29]] -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i64 [[TMP30]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) @@ -438,86 +428,86 @@ ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP70:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[TMP35]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = add i64 [[INDEX]], [[TMP37]] -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = add i64 [[INDEX]], [[TMP42]] -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = add i64 [[INDEX]], [[TMP47]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP33]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP38]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP43]] -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP48]] -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP55]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP56]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP58]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP59]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP61]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP62]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP63]]) -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = select [[ACTIVE_LANE_MASK6]], [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP64]], [[TMP65]]) -; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = select [[ACTIVE_LANE_MASK7]], [[WIDE_MASKED_LOAD10]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], [[TMP67]]) -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP70]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP68]], [[TMP69]]) -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = add i64 [[INDEX]], [[TMP72]] -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], [[TMP75]] -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = add i64 [[INDEX]], [[TMP78]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP17]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP73]], i64 [[TMP22]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP76]], i64 [[TMP27]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP79]], i64 [[TMP32]]) +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP62:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], [[TMP29]] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], [[TMP34]] +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = add i64 [[TMP37]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = add i64 [[INDEX]], [[TMP39]] +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP30]] +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP45]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP47]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP48]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP50]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP53]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP55]]) +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = select [[ACTIVE_LANE_MASK6]], [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP56]], [[TMP57]]) +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = select [[ACTIVE_LANE_MASK7]], [[WIDE_MASKED_LOAD10]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP58]], [[TMP59]]) +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP62]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP60]], [[TMP61]]) +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], [[TMP64]] +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = add i64 [[INDEX]], [[TMP67]] +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = add i64 [[INDEX]], [[TMP70]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP21]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP65]], i64 [[TMP22]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP68]], i64 [[TMP23]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP71]], i64 [[TMP24]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = extractelement [[TMP80]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = extractelement [[TMP72]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP70]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP62]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP85]], [[SUM_07]] +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP77]], [[SUM_07]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP70]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP62]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] ; @@ -732,60 +722,58 @@ ; CHECK-ORDERED-TF-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[TMP2]], [[TMP11]] -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP2]], [[TMP11]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]] +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP15]] -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i1( [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK]]) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( [[WIDE_MASKED_VEC]]) -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP19]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP23]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP22]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]]) +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP18]]) +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP20]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDXB1]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD1]] = fadd float [[TMP26]], [[ADD_PHI2]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDXB1]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD1]] = fadd float [[TMP24]], [[ADD_PHI2]] ; CHECK-ORDERED-TF-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[OR]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDXB2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD2]] = fadd float [[TMP27]], [[ADD_PHI1]] +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDXB2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD2]] = fadd float [[TMP25]], [[ADD_PHI1]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 ; CHECK-ORDERED-TF-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 ; CHECK-ORDERED-TF-NEXT: ret void @@ -993,52 +981,50 @@ ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[TMP9]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[N]], [[TMP9]] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[TMP7]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] ; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP20]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP19]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]]) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP17]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = extractelement [[TMP21]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-ORDERED-TF-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], [[TMP24]] +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD:%.*]] = fadd float [[TMP21]], [[TMP22]] ; CHECK-ORDERED-TF-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-ORDERED-TF: for.end.loopexit: -; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END]] ; CHECK-ORDERED-TF: for.end: ; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] @@ -1234,62 +1220,60 @@ ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = fcmp une [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[TMP16]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, [[TMP17]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = xor [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP19]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP20]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = or [[TMP17]], [[TMP20]] -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select [[TMP21]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP23]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP22]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = fcmp une [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP14]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, [[TMP15]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP18]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or [[TMP15]], [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[TMP19]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP20]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP26]], 0.000000e+00 +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP24]], 0.000000e+00 ; CHECK-ORDERED-TF-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK-ORDERED-TF: if.then: ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-ORDERED-TF-NEXT: br label [[FOR_INC]] ; CHECK-ORDERED-TF: for.inc: -; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP27]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP25]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[RDX]] ; @@ -1719,26 +1703,18 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 24 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP14]] -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[N]], [[TMP14]] -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = sub i64 [[N]], [[TMP19]] -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[N]], [[TMP19]] -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i64 [[TMP20]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = sub i64 [[N]], [[TMP24]] -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = icmp ugt i64 [[N]], [[TMP24]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[TMP25]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = sub i64 [[N]], [[TMP29]] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = icmp ugt i64 [[N]], [[TMP29]] -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i64 [[TMP30]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) @@ -1750,110 +1726,110 @@ ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP88:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[TMP35]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = add i64 [[INDEX]], [[TMP37]] -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = add i64 [[INDEX]], [[TMP42]] -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = add i64 [[INDEX]], [[TMP47]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP33]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP38]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP43]] -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP48]] -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP55]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP56]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP58]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP59]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], [[TMP29]] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], [[TMP34]] +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = add i64 [[TMP37]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = add i64 [[INDEX]], [[TMP39]] +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP30]] +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP45]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP47]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP48]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP50]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP53]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP30]] +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP55]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP59]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP61]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP62]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP33]] -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP38]] -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP43]] -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP48]] -; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP67]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = mul i64 [[TMP68]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP69]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP70]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP72]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP73]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP75]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP76]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = fmul [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = fmul [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = fmul [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP77]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP81]]) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = select [[ACTIVE_LANE_MASK6]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP79]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP86:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP80]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP88]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP86]], [[TMP87]]) -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]] -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]] -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP17]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP22]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP27]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP32]]) +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP55]], i64 [[TMP61]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP62]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP55]], i64 [[TMP64]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP55]], i64 [[TMP67]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = fmul [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = fmul [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = fmul [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] +; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP69]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP73]]) +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = select [[ACTIVE_LANE_MASK6]], [[TMP70]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP74]], [[TMP75]]) +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP71]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP76]], [[TMP77]]) +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP72]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP80]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP78]], [[TMP79]]) +; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = mul i64 [[TMP81]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = add i64 [[INDEX]], [[TMP82]] +; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = mul i64 [[TMP84]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP86:%.*]] = add i64 [[INDEX]], [[TMP85]] +; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = add i64 [[INDEX]], [[TMP88]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP21]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP83]], i64 [[TMP22]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP86]], i64 [[TMP23]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP89]], i64 [[TMP24]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement [[TMP98]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = extractelement [[TMP90]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP94]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP88]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP80]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP103]], float [[TMP104]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP95]], float [[TMP96]], float [[SUM_07]]) ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP88]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP80]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] ; @@ -2133,26 +2109,18 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 24 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP14]] -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[N]], [[TMP14]] -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = sub i64 [[N]], [[TMP19]] -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[N]], [[TMP19]] -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i64 [[TMP20]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = sub i64 [[N]], [[TMP24]] -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = icmp ugt i64 [[N]], [[TMP24]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[TMP25]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = sub i64 [[N]], [[TMP29]] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = icmp ugt i64 [[N]], [[TMP29]] -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i64 [[TMP30]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) @@ -2164,110 +2132,110 @@ ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP88:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[TMP35]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = add i64 [[INDEX]], [[TMP37]] -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = add i64 [[INDEX]], [[TMP42]] -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = add i64 [[INDEX]], [[TMP47]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP33]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP38]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP43]] -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP48]] -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP55]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP56]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP58]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP59]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], [[TMP29]] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], [[TMP34]] +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = add i64 [[TMP37]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = add i64 [[INDEX]], [[TMP39]] +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP30]] +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP45]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP47]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP48]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP50]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP53]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP30]] +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP55]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP59]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP61]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP62]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP33]] -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP38]] -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP43]] -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP48]] -; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP67]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = mul i64 [[TMP68]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP69]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP70]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP72]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP73]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP75]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP76]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = fmul nnan [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = fmul nnan [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = fmul nnan [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[TMP77]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP81]]) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = select nnan [[ACTIVE_LANE_MASK6]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select nnan [[ACTIVE_LANE_MASK7]], [[TMP79]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP86:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP80]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP88]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP86]], [[TMP87]]) -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]] -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]] -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP17]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP22]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP27]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP32]]) +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP55]], i64 [[TMP61]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP62]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP55]], i64 [[TMP64]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP55]], i64 [[TMP67]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = fmul nnan [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = fmul nnan [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = fmul nnan [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] +; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[TMP69]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP73]]) +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = select nnan [[ACTIVE_LANE_MASK6]], [[TMP70]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP74]], [[TMP75]]) +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = select nnan [[ACTIVE_LANE_MASK7]], [[TMP71]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP76]], [[TMP77]]) +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP72]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP80]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP78]], [[TMP79]]) +; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = mul i64 [[TMP81]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = add i64 [[INDEX]], [[TMP82]] +; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = mul i64 [[TMP84]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP86:%.*]] = add i64 [[INDEX]], [[TMP85]] +; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = add i64 [[INDEX]], [[TMP88]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP21]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP83]], i64 [[TMP22]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP86]], i64 [[TMP23]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP89]], i64 [[TMP24]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement [[TMP98]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = extractelement [[TMP90]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP94]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP88]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP80]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP103]], float [[TMP104]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP95]], float [[TMP96]], float [[SUM_07]]) ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP88]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP80]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -17,8 +17,10 @@ ; VPLANS-NEXT: No successors ; VPLANS-EMPTY: ; VPLANS-NEXT: vector.ph: -; VPLANS-NEXT: EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0> -; VPLANS-NEXT: EMIT vp<[[NEWTC:%[0-9]+]]> = TC > VF ? TC - VF : 0 vp<[[TC]]> +; VPLANS-NEXT: EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0> +; VPLANS-NEXT: EMIT vp<[[SUB:%[0-9]+]]> = sub vp<[[TC]]>, +; VPLANS-NEXT: EMIT vp<[[CMP:%[0-9]+]]> = icmp ugt vp<[[TC]]>, +; VPLANS-NEXT: EMIT vp<[[NEWTC:%[0-9]+]]> = select vp<[[CMP]]>, vp<[[SUB]]>, ir<0> ; VPLANS-NEXT: EMIT vp<[[LANEMASK_ENTRY:%[0-9]+]]> = active lane mask vp<[[VF]]>, vp<[[TC]]> ; VPLANS-NEXT: Successor(s): vector loop ; VPLANS-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -29,26 +29,18 @@ ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 12 ; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 16 -; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[UMAX]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[UMAX]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0 -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16 -; CHECK-NEXT: [[TMP20:%.*]] = sub i64 [[UMAX]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[UMAX]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i64 [[TMP20]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 16 -; CHECK-NEXT: [[TMP25:%.*]] = sub i64 [[UMAX]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp ugt i64 [[UMAX]], [[TMP24]] -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-NEXT: [[TMP30:%.*]] = sub i64 [[UMAX]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = icmp ugt i64 [[UMAX]], [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i64 [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) @@ -62,60 +54,60 @@ ; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[INDEX6]], 0 -; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 -; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP35]], 0 -; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 1 -; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[INDEX6]], [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], 0 -; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 1 -; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[INDEX6]], [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 12 -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], 0 -; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 1 -; CHECK-NEXT: [[TMP48:%.*]] = add i64 [[INDEX6]], [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP33]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP38]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP43]] -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP48]] -; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, ptr [[TMP49]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 4 -; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP55]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP56]], i32 4, [[ACTIVE_LANE_MASK7]]) -; CHECK-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 8 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP58]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP59]], i32 4, [[ACTIVE_LANE_MASK8]]) -; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 12 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP61]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP62]], i32 4, [[ACTIVE_LANE_MASK9]]) -; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 4 -; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 -; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[INDEX6]], [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 12 -; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[INDEX6]], [[TMP70]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP17]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP65]], i64 [[TMP22]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP68]], i64 [[TMP27]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP32]]) +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX6]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX6]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 12 +; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[TMP37]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = add i64 [[INDEX6]], [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP40]] +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i32, ptr [[TMP41]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP45]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 4 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[TMP41]], i64 [[TMP47]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP48]], i32 4, [[ACTIVE_LANE_MASK7]]) +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 8 +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP41]], i64 [[TMP50]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK8]]) +; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 12 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP41]], i64 [[TMP53]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK9]]) +; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 4 +; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX6]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 8 +; CHECK-NEXT: [[TMP60:%.*]] = add i64 [[INDEX6]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 12 +; CHECK-NEXT: [[TMP63:%.*]] = add i64 [[INDEX6]], [[TMP62]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP21]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP57]], i64 [[TMP22]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP60]], i64 [[TMP23]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP63]], i64 [[TMP24]]) ; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP6]] -; CHECK-NEXT: [[TMP72:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP73:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP74:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP75:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP76:%.*]] = extractelement [[TMP72]], i32 0 -; CHECK-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP64:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP65:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP66:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP67:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP68:%.*]] = extractelement [[TMP64]], i32 0 +; CHECK-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -171,26 +163,18 @@ ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 12 ; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 16 -; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[UMAX]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[UMAX]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0 -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16 -; CHECK-NEXT: [[TMP20:%.*]] = sub i64 [[UMAX]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[UMAX]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i64 [[TMP20]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 16 -; CHECK-NEXT: [[TMP25:%.*]] = sub i64 [[UMAX]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp ugt i64 [[UMAX]], [[TMP24]] -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-NEXT: [[TMP30:%.*]] = sub i64 [[UMAX]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = icmp ugt i64 [[UMAX]], [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i64 [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP17]], i64 [[TMP13]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) @@ -204,86 +188,86 @@ ; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[INDEX6]], 0 -; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 -; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP35]], 0 -; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 1 -; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[INDEX6]], [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], 0 -; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 1 -; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[INDEX6]], [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 12 -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], 0 -; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 1 -; CHECK-NEXT: [[TMP48:%.*]] = add i64 [[INDEX6]], [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP33]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP38]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP43]] -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP48]] -; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, ptr [[TMP49]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 4 -; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP55]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP56]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 8 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP58]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP59]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 12 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP61]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP62]], i32 4, [[ACTIVE_LANE_MASK9]], poison) -; CHECK-NEXT: [[TMP63:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP64:%.*]] = icmp ne [[WIDE_MASKED_LOAD10]], zeroinitializer -; CHECK-NEXT: [[TMP65:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer -; CHECK-NEXT: [[TMP66:%.*]] = icmp ne [[WIDE_MASKED_LOAD12]], zeroinitializer -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP33]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP38]] -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP43]] -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP48]] -; CHECK-NEXT: [[TMP71:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP63]], zeroinitializer -; CHECK-NEXT: [[TMP72:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP64]], zeroinitializer -; CHECK-NEXT: [[TMP73:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP65]], zeroinitializer -; CHECK-NEXT: [[TMP74:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP66]], zeroinitializer -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP67]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP75]], i32 4, [[TMP71]]) -; CHECK-NEXT: [[TMP76:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP77:%.*]] = mul i64 [[TMP76]], 4 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr i32, ptr [[TMP67]], i64 [[TMP77]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP78]], i32 4, [[TMP72]]) -; CHECK-NEXT: [[TMP79:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 8 -; CHECK-NEXT: [[TMP81:%.*]] = getelementptr i32, ptr [[TMP67]], i64 [[TMP80]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP81]], i32 4, [[TMP73]]) -; CHECK-NEXT: [[TMP82:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP83:%.*]] = mul i64 [[TMP82]], 12 -; CHECK-NEXT: [[TMP84:%.*]] = getelementptr i32, ptr [[TMP67]], i64 [[TMP83]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP84]], i32 4, [[TMP74]]) -; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP86:%.*]] = mul i64 [[TMP85]], 4 -; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]] -; CHECK-NEXT: [[TMP88:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP89:%.*]] = mul i64 [[TMP88]], 8 -; CHECK-NEXT: [[TMP90:%.*]] = add i64 [[INDEX6]], [[TMP89]] -; CHECK-NEXT: [[TMP91:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP92:%.*]] = mul i64 [[TMP91]], 12 -; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX6]], [[TMP92]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP17]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP87]], i64 [[TMP22]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP27]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP32]]) +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX6]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX6]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 12 +; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[TMP37]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = add i64 [[INDEX6]], [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP40]] +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i32, ptr [[TMP41]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP45]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 4 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[TMP41]], i64 [[TMP47]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP48]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 8 +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP41]], i64 [[TMP50]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 12 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP41]], i64 [[TMP53]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-NEXT: [[TMP55:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP56:%.*]] = icmp ne [[WIDE_MASKED_LOAD10]], zeroinitializer +; CHECK-NEXT: [[TMP57:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer +; CHECK-NEXT: [[TMP58:%.*]] = icmp ne [[WIDE_MASKED_LOAD12]], zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP40]] +; CHECK-NEXT: [[TMP63:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP55]], zeroinitializer +; CHECK-NEXT: [[TMP64:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP56]], zeroinitializer +; CHECK-NEXT: [[TMP65:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP57]], zeroinitializer +; CHECK-NEXT: [[TMP66:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP58]], zeroinitializer +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[TMP59]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP67]], i32 4, [[TMP63]]) +; CHECK-NEXT: [[TMP68:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP69:%.*]] = mul i64 [[TMP68]], 4 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP59]], i64 [[TMP69]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP70]], i32 4, [[TMP64]]) +; CHECK-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 8 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP59]], i64 [[TMP72]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, [[TMP65]]) +; CHECK-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 12 +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i32, ptr [[TMP59]], i64 [[TMP75]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, [[TMP66]]) +; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 4 +; CHECK-NEXT: [[TMP79:%.*]] = add i64 [[INDEX6]], [[TMP78]] +; CHECK-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP81:%.*]] = mul i64 [[TMP80]], 8 +; CHECK-NEXT: [[TMP82:%.*]] = add i64 [[INDEX6]], [[TMP81]] +; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 12 +; CHECK-NEXT: [[TMP85:%.*]] = add i64 [[INDEX6]], [[TMP84]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP21]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP79]], i64 [[TMP22]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP82]], i64 [[TMP23]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP85]], i64 [[TMP24]]) ; CHECK-NEXT: [[INDEX_NEXT16]] = add i64 [[INDEX6]], [[TMP6]] -; CHECK-NEXT: [[TMP94:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP95:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP96:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP97:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP98:%.*]] = extractelement [[TMP94]], i32 0 -; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP86:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP87:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP88:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP89:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP90:%.*]] = extractelement [[TMP86]], i32 0 +; CHECK-NEXT: br i1 [[TMP90]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: