diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -641,10 +641,6 @@
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);
 
-  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
-  /// represented as.
-  void truncateToMinimalBitwidths(VPTransformState &State);
-
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
 
@@ -3429,151 +3425,8 @@
   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
 }
 
-void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
-  // For every instruction `I` in MinBWs, truncate the operands, create a
-  // truncated version of `I` and reextend its result. InstCombine runs
-  // later and will remove any ext/trunc pairs.
-  SmallPtrSet<Value *, 4> Erased;
-  for (const auto &KV : Cost->getMinimalBitwidths()) {
-    // If the value wasn't vectorized, we must maintain the original scalar
-    // type. The absence of the value from State indicates that it
-    // wasn't vectorized.
-    // FIXME: Should not rely on getVPValue at this point.
-    VPValue *Def = State.Plan->getVPValue(KV.first, true);
-    if (!State.hasAnyVectorValue(Def))
-      continue;
-    // If the instruction is defined outside the loop, only update the first
-    // part; the first part will be re-used for all other parts.
-    unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
-    for (unsigned Part = 0; Part < UFToUse; ++Part) {
-      Value *I = State.get(Def, Part);
-      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
-        continue;
-      Type *OriginalTy = I->getType();
-      Type *ScalarTruncatedTy =
-          IntegerType::get(OriginalTy->getContext(), KV.second);
-      auto *TruncatedTy = VectorType::get(
-          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
-      if (TruncatedTy == OriginalTy)
-        continue;
-
-      IRBuilder<> B(cast<Instruction>(I));
-      auto ShrinkOperand = [&](Value *V) -> Value * {
-        if (auto *ZI = dyn_cast<ZExtInst>(V))
-          if (ZI->getSrcTy() == TruncatedTy)
-            return ZI->getOperand(0);
-        return B.CreateZExtOrTrunc(V, TruncatedTy);
-      };
-
-      // The actual instruction modification depends on the instruction type,
-      // unfortunately.
-      Value *NewI = nullptr;
-      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
-        Value *Op0 = ShrinkOperand(BO->getOperand(0));
-        Value *Op1 = ShrinkOperand(BO->getOperand(1));
-        NewI = B.CreateBinOp(BO->getOpcode(), Op0, Op1);
-
-        // Any wrapping introduced by shrinking this operation shouldn't be
-        // considered undefined behavior. So, we can't unconditionally copy
-        // arithmetic wrapping flags to NewI.
-        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
-      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
-        Value *Op0 = ShrinkOperand(CI->getOperand(0));
-        Value *Op1 = ShrinkOperand(CI->getOperand(1));
-        NewI = B.CreateICmp(CI->getPredicate(), Op0, Op1);
-      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
-        Value *TV = ShrinkOperand(SI->getTrueValue());
-        Value *FV = ShrinkOperand(SI->getFalseValue());
-        NewI = B.CreateSelect(SI->getCondition(), TV, FV);
-      } else if (auto *CI = dyn_cast<CastInst>(I)) {
-        switch (CI->getOpcode()) {
-        default:
-          llvm_unreachable("Unhandled cast!");
-        case Instruction::Trunc:
-          NewI = ShrinkOperand(CI->getOperand(0));
-          break;
-        case Instruction::SExt:
-          NewI = B.CreateSExtOrTrunc(
-              CI->getOperand(0),
-              smallestIntegerVectorType(OriginalTy, TruncatedTy));
-          break;
-        case Instruction::ZExt:
-          NewI = B.CreateZExtOrTrunc(
-              CI->getOperand(0),
-              smallestIntegerVectorType(OriginalTy, TruncatedTy));
-          break;
-        }
-      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
-        auto Elements0 =
-            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
-        auto Elements1 =
-            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
-        auto *O1 = B.CreateZExtOrTrunc(
-            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
-
-        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
-      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
-        // Don't do anything with the operands, just extend the result.
-        continue;
-      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
-        auto Elements =
-            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
-        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
-        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
-      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
-        auto Elements =
-            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
-        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
-      } else {
-        // If we don't know what to do, be conservative and don't do anything.
-        continue;
-      }
-
-      // Lastly, extend the result.
-      NewI->takeName(cast<Instruction>(I));
-      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
-      I->replaceAllUsesWith(Res);
-      cast<Instruction>(I)->eraseFromParent();
-      Erased.insert(I);
-      State.reset(Def, Res, Part);
-    }
-  }
-
-  // We'll have created a bunch of ZExts that are now parentless. Clean up.
-  for (const auto &KV : Cost->getMinimalBitwidths()) {
-    // If the value wasn't vectorized, we must maintain the original scalar
-    // type. The absence of the value from State indicates that it
-    // wasn't vectorized.
-    // FIXME: Should not rely on getVPValue at this point.
-    VPValue *Def = State.Plan->getVPValue(KV.first, true);
-    if (!State.hasAnyVectorValue(Def))
-      continue;
-    unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
-    for (unsigned Part = 0; Part < UFToUse; ++Part) {
-      Value *I = State.get(Def, Part);
-      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
-      if (Inst && Inst->use_empty()) {
-        Value *NewI = Inst->getOperand(0);
-        Inst->eraseFromParent();
-        State.reset(Def, NewI, Part);
-      }
-    }
-  }
-}
-
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                             VPlan &Plan) {
-  // Insert truncates and extends for any truncated instructions as hints to
-  // InstCombine.
-  if (VF.isVector())
-    truncateToMinimalBitwidths(State);
-
   // Fix widened non-induction PHIs by setting up the PHI operands.
   if (EnableVPlanNativePath)
     fixNonInductionPHIs(Plan, State);
 
@@ -8741,7 +8594,11 @@
     VFRange SubRange = {VF, MaxVFTimes2};
     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
       // Now optimize the initial VPlan.
+      if (!Plan->hasVF(ElementCount::getFixed(1)))
+        VPlanTransforms::truncateToMinimalBitwidths(
+            *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
       VPlanTransforms::optimize(*Plan, *PSE.getSE());
+      assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -275,10 +275,6 @@
            I->second[Part];
   }
 
-  bool hasAnyVectorValue(VPValue *Def) const {
-    return Data.PerPartOutput.contains(Def);
-  }
-
   bool hasScalarValue(VPValue *Def, VPIteration Instance) {
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -79,6 +79,13 @@
                                      bool UseActiveLaneMaskForControlFlow,
                                      bool DataAndControlFlowWithoutRuntimeCheck);
 
+  /// Insert truncates and extends for any truncated recipe. Redundant casts
+  /// will be folded later.
+  static void
+  truncateToMinimalBitwidths(VPlan &Plan,
+                             const MapVector<Instruction *, uint64_t> &MinBWs,
+                             LLVMContext &Ctx);
+
 private:
   /// Remove redundant VPBasicBlocks by merging them into their predecessor if
   /// the predecessor has a single successor.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -870,6 +870,103 @@
   }
 }
 
+void VPlanTransforms::truncateToMinimalBitwidths(
+    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
+    LLVMContext &Ctx) {
+#ifndef NDEBUG
+  // Count the processed recipes and cross check the count later with MinBWs
+  // size, to make sure all entries in MinBWs have been handled.
+  unsigned NumProcessedRecipes = 0;
+#endif
+  // Keep track of created truncates, so they can be re-used. Note that we
+  // cannot use RAUW after creating a new truncate, as this could make other
+  // uses have different types for their operands, making them invalidly
+  // typed.
+  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
+  VPTypeAnalysis TypeInfo(Ctx);
+  VPBasicBlock *PH = Plan.getEntry();
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+               VPWidenSelectRecipe>(&R))
+        continue;
+
+      VPValue *ResultVPV = R.getVPSingleValue();
+      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
+      unsigned NewResSizeInBits = MinBWs.lookup(UI);
+      if (!NewResSizeInBits)
+        continue;
+
+#ifndef NDEBUG
+      NumProcessedRecipes++;
+#endif
+      // If the value wasn't vectorized, we must maintain the original scalar
+      // type. Skip those here, after incrementing NumProcessedRecipes. Also
+      // skip casts which do not need to be handled explicitly here, as
+      // redundant casts will be removed during recipe simplification.
+      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R))
+        continue;
+
+      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
+      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
+      assert(OldResTy->isIntegerTy() && "only integer types supported");
+      assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
+
+      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+
+      // Shrink operands by introducing truncates as needed.
+      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
+      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
+        auto *Op = R.getOperand(Idx);
+        unsigned OpSizeInBits =
+            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
+        if (OpSizeInBits == NewResSizeInBits)
+          continue;
+        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
+        auto [ProcessedIter, IterIsEmpty] =
+            ProcessedTruncs.insert({Op, nullptr});
+        VPWidenCastRecipe *NewOp =
+            IterIsEmpty
+                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
+                : ProcessedIter->second;
+        R.setOperand(Idx, NewOp);
+        if (!IterIsEmpty)
+          continue;
+        ProcessedIter->second = NewOp;
+        if (!Op->isLiveIn()) {
+          NewOp->insertBefore(&R);
+        } else {
+          PH->appendRecipe(NewOp);
+#ifndef NDEBUG
+          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
+          bool IsContained = MinBWs.contains(OpInst);
+          assert((!OpInst || IsContained) &&
+                 "All processed instructions should be contained in MinBWs.");
+          NumProcessedRecipes += IsContained;
+#endif
+        }
+      }
+
+      // Any wrapping introduced by shrinking this operation shouldn't be
+      // considered undefined behavior. So, we can't unconditionally copy
+      // arithmetic wrapping flags to VPW.
+      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
+        VPW->dropPoisonGeneratingFlags();
+
+      // Extend result to original width.
+      auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
+      Ext->insertAfter(&R);
+      ResultVPV->replaceAllUsesWith(Ext);
+      Ext->setOperand(0, ResultVPV);
+    }
+  }
+
+  assert(MinBWs.size() == NumProcessedRecipes &&
+         "some entries in MinBWs haven't been processed");
+}
+
 void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
   removeRedundantCanonicalIVs(Plan);
   removeRedundantInductionCasts(Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -39,14 +39,13 @@
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw <16 x i16> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr <16 x i16> [[TMP11]],
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i16> [[TMP12]] to <16 x i8>
-; CHECK-NEXT:    store <16 x i8> [[TMP13]], ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <16 x i16> [[TMP9]], [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr <16 x i16> [[TMP10]],
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc <16 x i16> [[TMP11]] to <16 x i8>
+; CHECK-NEXT:    store <16 x i8> [[TMP12]], ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
!llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -60,27 +59,26 @@ ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX7]] -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i8>, ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX7]] -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i8>, ptr [[TMP16]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_LOAD9]] to <8 x i16> -; CHECK-NEXT: [[TMP18:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16> -; CHECK-NEXT: [[TMP19:%.*]] = mul nuw <8 x i16> [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = lshr <8 x i16> [[TMP19]], -; CHECK-NEXT: [[TMP21:%.*]] = trunc <8 x i16> [[TMP20]] to <8 x i8> -; CHECK-NEXT: store <8 x i8> [[TMP21]], ptr [[TMP16]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX7]] -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, ptr [[TMP22]], align 1 -; CHECK-NEXT: [[TMP23:%.*]] = zext <8 x i8> [[WIDE_LOAD10]] to <8 x i16> -; CHECK-NEXT: [[TMP24:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16> -; CHECK-NEXT: [[TMP25:%.*]] = mul nuw <8 x i16> [[TMP23]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = lshr <8 x i16> [[TMP25]], -; CHECK-NEXT: [[TMP27:%.*]] = trunc <8 x i16> [[TMP26]] to <8 x i8> -; CHECK-NEXT: store <8 x i8> [[TMP27]], ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX7]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i8>, ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX7]] +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = zext <8 x i8> [[WIDE_LOAD9]] to <8 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = mul nuw <8 x i16> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = lshr <8 x i16> [[TMP18]], +; CHECK-NEXT: [[TMP20:%.*]] = trunc <8 x i16> [[TMP19]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP20]], ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX7]] +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = zext <8 x i8> [[WIDE_LOAD10]] to <8 x i16> +; CHECK-NEXT: [[TMP23:%.*]] = mul nuw <8 x i16> [[TMP22]], [[TMP17]] +; CHECK-NEXT: [[TMP24:%.*]] = lshr <8 x i16> [[TMP23]], +; CHECK-NEXT: [[TMP25:%.*]] = trunc <8 x i16> [[TMP24]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP25]], ptr [[TMP21]], align 1 ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 8 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP26]], label 
[[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N_VEC5]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -94,18 +92,18 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP29]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP27]] to i32 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP30]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP28]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV3]], [[CONV]] ; CHECK-NEXT: [[SHR_26:%.*]] = lshr i32 [[MUL]], 8 ; CHECK-NEXT: [[CONV4:%.*]] = trunc i32 [[SHR_26]] to i8 ; CHECK-NEXT: store i8 [[CONV4]], ptr [[ARRAYIDX2]], align 1 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1 -; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP31]] to i32 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP29]] to i32 ; CHECK-NEXT: [[MUL10:%.*]] = mul nuw nsw i32 [[CONV9]], [[CONV]] ; CHECK-NEXT: [[SHR11_27:%.*]] = lshr i32 [[MUL10]], 8 ; CHECK-NEXT: [[CONV12:%.*]] = trunc i32 [[SHR11_27]] to i8 @@ -158,54 +156,57 @@ ; CHECK-LABEL: define void @test_shrink_zext_in_preheader ; CHECK-SAME: (ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[A:%.*]], i16 [[B:%.*]]) { ; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[CONV10:%.*]] = zext i16 [[B]] to i32 ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i16> undef, i16 [[B]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[TMP0]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[CONV10]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT2]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT2]] to <16 x i16> ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i16> [[BROADCAST_SPLAT2]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i16> [[BROADCAST_SPLAT2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i16> [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]], -; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i16> [[TMP5]] to <16 x i8> +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]], ; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i16> [[TMP6]] to <16 x i8> -; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[INDEX]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP9]] -; CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 16 +; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8> +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[INDEX]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]] ; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16 +; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0 -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[A]] to i16 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> undef, i16 [[TMP14]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = mul <8 x i16> [[TMP15]], [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]], -; CHECK-NEXT: [[TMP18:%.*]] = trunc <8 x i16> [[TMP17]] to <8 x i8> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP18]], <8 x i8> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[INDEX4]] to i64 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP20]] -; CHECK-NEXT: store <8 x i8> [[TMP19]], ptr [[TMP21]], align 1 -; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 8 -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 1000 -; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ 
[[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = mul <8 x i16> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = lshr <8 x i16> [[TMP17]], +; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i16> [[TMP18]] to <8 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext i32 [[INDEX7]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: store <8 x i8> [[TMP20]], ptr [[TMP22]], align 1 +; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i32 [[INDEX7]], 8 +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT8]], 1000 +; CHECK-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -245,11 +246,11 @@ ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[A]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i16> undef, i16 [[TMP0]], i64 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[A]] to i16 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i16> undef, i16 [[TMP0]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i16> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = lshr <16 x i16> [[TMP3]], @@ -266,21 +267,21 @@ ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[A]] to i16 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i16> undef, i16 [[TMP10]], i64 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP11]], ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP12]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = lshr <8 x i16> [[TMP13]], ; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[C]], <8 x i16> [[TMP14]], <8 x i16> [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = trunc <8 x i16> [[TMP15]] to <8 x i8> -; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[INDEX2]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[INDEX3]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP17]] ; CHECK-NEXT: store <8 x i8> [[TMP16]], ptr [[TMP18]], align 1 -; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 8 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 1000 +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX3]], 8 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT4]], 1000 ; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 
true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -28,14 +28,12 @@ ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i8> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[TMP5]] to <16 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -50,19 +48,17 @@ ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX5]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = add <8 x i8> [[WIDE_LOAD6]], -; CHECK-NEXT: [[TMP14:%.*]] = zext <8 x i8> [[TMP13]] to <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = trunc <8 x i32> [[TMP14]] to <8 x i8> -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP15]], ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <8 x i8> [[WIDE_LOAD6]], +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store <8 x i8> [[TMP11]], ptr [[TMP13]], align 1 ; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], 8 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], 
!llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -76,8 +72,8 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP19]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP15]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2 ; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i8 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]] @@ -229,14 +225,12 @@ ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i32> [[TMP5]] to <8 x i16> -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0 -; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[TMP6]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -250,8 +244,8 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV8:%.*]] = zext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[CONV8:%.*]] = zext i16 [[TMP8]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV8]], 2 ; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]] @@ -307,17 +301,13 @@ ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i16> [[TMP4]] to <16 x i32> -; 
CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[TMP5]] to <16 x i16> -; CHECK-NEXT: [[TMP7:%.*]] = add <16 x i16> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i16> [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i16> -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP11]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <16 x i16> [[TMP5]], ptr [[TMP7]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -332,22 +322,18 @@ ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX5]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i16> -; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i16> [[TMP16]] to <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = trunc <8 x i32> [[TMP17]] to <8 x i16> -; CHECK-NEXT: [[TMP19:%.*]] = add <8 x i16> [[TMP18]], -; CHECK-NEXT: [[TMP20:%.*]] = zext <8 x i16> [[TMP19]] to <8 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = trunc <8 x i32> [[TMP20]] to <8 x i16> -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 -; CHECK-NEXT: store <8 x i16> [[TMP21]], ptr [[TMP23]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX5]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = add <8 x i16> [[TMP12]], +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP14]], i32 0 +; CHECK-NEXT: store <8 x i16> [[TMP13]], ptr [[TMP15]], align 2 ; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], 8 -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP24]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] +; CHECK-NEXT: br i1 
[[TMP16]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -361,8 +347,8 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP25]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP17]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2 ; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]] @@ -485,52 +471,32 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV13]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT]] to <16 x i8> -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i8> ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[CONV11]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT2]] to <16 x i8> -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32> +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT3]] to <16 x i8> ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <16 x i32> [[TMP10]] to <16 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i8> [[TMP8]], -; CHECK-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8> -; CHECK-NEXT: [[TMP17:%.*]] = mul <16 x i8> [[TMP16]], -; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8> -; CHECK-NEXT: 
[[TMP20:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i8> -; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i8> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = zext <16 x i8> [[TMP21]] to <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = trunc <16 x i32> [[TMP18]] to <16 x i8> -; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i8> [[TMP23]], -; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = trunc <16 x i32> [[TMP25]] to <16 x i8> -; CHECK-NEXT: [[TMP27:%.*]] = trunc <16 x i32> [[TMP4]] to <16 x i8> -; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i8> [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = trunc <16 x i32> [[TMP29]] to <16 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = trunc <16 x i32> [[TMP22]] to <16 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = mul <16 x i8> [[TMP30]], [[TMP31]] -; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = trunc <16 x i32> [[TMP33]] to <16 x i8> -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i32 0 -; CHECK-NEXT: store <16 x i8> [[TMP34]], ptr [[TMP36]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = shl <16 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i8> [[TMP7]], [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i8> [[TMP9]], +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i8> [[TMP11]], [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = mul <16 x i8> [[TMP12]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[TMP15]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -542,53 +508,33 @@ ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0 -; CHECK-NEXT: [[TMP38:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT8]] to <8 x i8> -; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x i8> [[TMP38]], <8 x i8> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP39:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT9]] to <8 x i32> -; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = 
insertelement <8 x i32> poison, i32 [[CONV11]], i64 0 -; CHECK-NEXT: [[TMP40:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT10]] to <8 x i8> -; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x i8> [[TMP40]], <8 x i8> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP41:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT11]] to <8 x i32> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT7]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT8]] to <8 x i8> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT9]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT10]] to <8 x i8> ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX7]], 0 -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP43]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = load <8 x i8>, ptr [[TMP44]], align 1 -; CHECK-NEXT: [[TMP46:%.*]] = shl <8 x i8> [[TMP45]], -; CHECK-NEXT: [[TMP47:%.*]] = zext <8 x i8> [[TMP46]] to <8 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = trunc <8 x i32> [[TMP47]] to <8 x i8> -; CHECK-NEXT: [[TMP49:%.*]] = add <8 x i8> [[TMP48]], -; CHECK-NEXT: [[TMP50:%.*]] = zext <8 x i8> [[TMP49]] to <8 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = or <8 x i8> [[TMP45]], -; CHECK-NEXT: [[TMP52:%.*]] = zext <8 x i8> [[TMP51]] to <8 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = trunc <8 x i32> [[TMP52]] to <8 x i8> -; CHECK-NEXT: [[TMP54:%.*]] = mul <8 x i8> [[TMP53]], -; CHECK-NEXT: [[TMP55:%.*]] = zext <8 x i8> [[TMP54]] to <8 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = trunc <8 x i32> [[TMP50]] to <8 x i8> -; CHECK-NEXT: [[TMP57:%.*]] = trunc <8 x i32> [[TMP39]] to <8 x i8> -; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i8> [[TMP56]], [[TMP57]] -; CHECK-NEXT: [[TMP59:%.*]] = zext <8 x i8> [[TMP58]] to <8 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = trunc <8 x i32> [[TMP55]] to <8 x i8> -; CHECK-NEXT: [[TMP61:%.*]] = and <8 x i8> [[TMP60]], -; CHECK-NEXT: [[TMP62:%.*]] = zext <8 x i8> [[TMP61]] to <8 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = trunc <8 x i32> [[TMP62]] to <8 x i8> -; CHECK-NEXT: [[TMP64:%.*]] = trunc <8 x i32> [[TMP41]] to <8 x i8> -; CHECK-NEXT: [[TMP65:%.*]] = xor <8 x i8> [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = zext <8 x i8> [[TMP65]] to <8 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = trunc <8 x i32> [[TMP66]] to <8 x i8> -; CHECK-NEXT: [[TMP68:%.*]] = trunc <8 x i32> [[TMP59]] to <8 x i8> -; CHECK-NEXT: [[TMP69:%.*]] = mul <8 x i8> [[TMP67]], [[TMP68]] -; CHECK-NEXT: [[TMP70:%.*]] = zext <8 x i8> [[TMP69]] to <8 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = trunc <8 x i32> [[TMP70]] to <8 x i8> -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP42]] -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[TMP72]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP71]], ptr [[TMP73]], align 1 -; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX7]], 8 -; CHECK-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 
[[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP74]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX11]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i8> [[WIDE_LOAD12]], +; CHECK-NEXT: [[TMP23:%.*]] = add <8 x i8> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i8> [[WIDE_LOAD12]], +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i8> [[TMP24]], +; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i8> [[TMP23]], [[TMP17]] +; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i8> [[TMP25]], +; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i8> [[TMP27]], [[TMP18]] +; CHECK-NEXT: [[TMP29:%.*]] = mul <8 x i8> [[TMP28]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0 +; CHECK-NEXT: store <8 x i8> [[TMP29]], ptr [[TMP31]], align 1 +; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX11]], 8 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP32]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -602,8 +548,8 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP75:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP75]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP33]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = shl i32 [[CONV]], 4 ; CHECK-NEXT: [[CONV2:%.*]] = add nuw nsw i32 [[ADD]], 32 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[CONV]], 51 @@ -673,58 +619,34 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV13]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT]] to <16 x i8> -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i8> ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[CONV11]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT2]] to <16 x i8> -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32> +; 
CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT3]] to <16 x i8>
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[TMP7]], align 2
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i16> [[WIDE_LOAD]] to <16 x i8>
-; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i32> [[TMP9]] to <16 x i8>
-; CHECK-NEXT: [[TMP11:%.*]] = shl <16 x i8> [[TMP10]],
-; CHECK-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
-; CHECK-NEXT: [[TMP14:%.*]] = add <16 x i8> [[TMP13]],
-; CHECK-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i8> [[TMP8]],
-; CHECK-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = trunc <16 x i32> [[TMP17]] to <16 x i8>
-; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i8> [[TMP18]],
-; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = trunc <16 x i32> [[TMP20]] to <16 x i8>
-; CHECK-NEXT: [[TMP22:%.*]] = mul <16 x i8> [[TMP21]],
-; CHECK-NEXT: [[TMP23:%.*]] = zext <16 x i8> [[TMP22]] to <16 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8>
-; CHECK-NEXT: [[TMP25:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT: [[TMP26:%.*]] = and <16 x i8> [[TMP24]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
-; CHECK-NEXT: [[TMP28:%.*]] = trunc <16 x i32> [[TMP23]] to <16 x i8>
-; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i8> [[TMP28]],
-; CHECK-NEXT: [[TMP30:%.*]] = zext <16 x i8> [[TMP29]] to <16 x i32>
-; CHECK-NEXT: [[TMP31:%.*]] = trunc <16 x i32> [[TMP30]] to <16 x i8>
-; CHECK-NEXT: [[TMP32:%.*]] = trunc <16 x i32> [[TMP4]] to <16 x i8>
-; CHECK-NEXT: [[TMP33:%.*]] = xor <16 x i8> [[TMP31]], [[TMP32]]
-; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
-; CHECK-NEXT: [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8>
-; CHECK-NEXT: [[TMP36:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
-; CHECK-NEXT: [[TMP37:%.*]] = mul <16 x i8> [[TMP35]], [[TMP36]]
-; CHECK-NEXT: [[TMP38:%.*]] = zext <16 x i8> [[TMP37]] to <16 x i32>
-; CHECK-NEXT: [[TMP39:%.*]] = trunc <16 x i32> [[TMP38]] to <16 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP40]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP39]], ptr [[TMP41]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i16> [[WIDE_LOAD]] to <16 x i8>
+; CHECK-NEXT: [[TMP7:%.*]] = shl <16 x i8> [[TMP6]],
+; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i8> [[TMP7]],
+; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP6]],
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP9]],
+; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP10]],
+; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i8> [[TMP8]], [[TMP1]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i8> [[TMP11]],
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i8> [[TMP13]], [[TMP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = mul <16 x i8> [[TMP14]], [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr [[TMP17]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -736,59 +658,35 @@
 ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8
 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
-; CHECK-NEXT: [[TMP43:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT9]] to <8 x i8>
-; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i8> [[TMP43]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP44:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT10]] to <8 x i32>
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
-; CHECK-NEXT: [[TMP45:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT11]] to <8 x i8>
-; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[TMP45]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP46:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT12]] to <8 x i32>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT7]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT8]] to <8 x i8>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT9]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT10]] to <8 x i8>
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP47:%.*]] = add i64 [[INDEX7]], 0
-; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i16, ptr [[TMP48]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP49]], align 2
-; CHECK-NEXT: [[TMP50:%.*]] = trunc <8 x i16> [[WIDE_LOAD8]] to <8 x i8>
-; CHECK-NEXT: [[TMP51:%.*]] = zext <8 x i8> [[TMP50]] to <8 x i32>
-; CHECK-NEXT: [[TMP52:%.*]] = trunc <8 x i32> [[TMP51]] to <8 x i8>
-; CHECK-NEXT: [[TMP53:%.*]] = shl <8 x i8> [[TMP52]],
-; CHECK-NEXT: [[TMP54:%.*]] = zext <8 x i8> [[TMP53]] to <8 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = trunc <8 x i32> [[TMP54]] to <8 x i8>
-; CHECK-NEXT: [[TMP56:%.*]] = add <8 x i8> [[TMP55]],
-; CHECK-NEXT: [[TMP57:%.*]] = zext <8 x i8> [[TMP56]] to <8 x i32>
-; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i8> [[TMP50]],
-; CHECK-NEXT: [[TMP59:%.*]] = zext <8 x i8> [[TMP58]] to <8 x i32>
-; CHECK-NEXT: [[TMP60:%.*]] = trunc <8 x i32> [[TMP59]] to <8 x i8>
-; CHECK-NEXT: [[TMP61:%.*]] = or <8 x i8> [[TMP60]],
-; CHECK-NEXT: [[TMP62:%.*]] = zext <8 x i8> [[TMP61]] to <8 x i32>
-; CHECK-NEXT: [[TMP63:%.*]] = trunc <8 x i32> [[TMP62]] to <8 x i8>
-; CHECK-NEXT: [[TMP64:%.*]] = mul <8 x i8> [[TMP63]],
-; CHECK-NEXT: [[TMP65:%.*]] = zext <8 x i8> [[TMP64]] to <8 x i32>
-; CHECK-NEXT: [[TMP66:%.*]] = trunc <8 x i32> [[TMP57]] to <8 x i8>
-; CHECK-NEXT: [[TMP67:%.*]] = trunc <8 x i32> [[TMP44]] to <8 x i8>
-; CHECK-NEXT: [[TMP68:%.*]] = and <8 x i8> [[TMP66]], [[TMP67]]
-; CHECK-NEXT: [[TMP69:%.*]] = zext <8 x i8> [[TMP68]] to <8 x i32>
-; CHECK-NEXT: [[TMP70:%.*]] = trunc <8 x i32> [[TMP65]] to <8 x i8>
-; CHECK-NEXT: [[TMP71:%.*]] = and <8 x i8> [[TMP70]],
-; CHECK-NEXT: [[TMP72:%.*]] = zext <8 x i8> [[TMP71]] to <8 x i32>
-; CHECK-NEXT: [[TMP73:%.*]] = trunc <8 x i32> [[TMP72]] to <8 x i8>
-; CHECK-NEXT: [[TMP74:%.*]] = trunc <8 x i32> [[TMP46]] to <8 x i8>
-; CHECK-NEXT: [[TMP75:%.*]] = xor <8 x i8> [[TMP73]], [[TMP74]]
-; CHECK-NEXT: [[TMP76:%.*]] = zext <8 x i8> [[TMP75]] to <8 x i32>
-; CHECK-NEXT: [[TMP77:%.*]] = trunc <8 x i32> [[TMP76]] to <8 x i8>
-; CHECK-NEXT: [[TMP78:%.*]] = trunc <8 x i32> [[TMP69]] to <8 x i8>
-; CHECK-NEXT: [[TMP79:%.*]] = mul <8 x i8> [[TMP77]], [[TMP78]]
-; CHECK-NEXT: [[TMP80:%.*]] = zext <8 x i8> [[TMP79]] to <8 x i32>
-; CHECK-NEXT: [[TMP81:%.*]] = trunc <8 x i32> [[TMP80]] to <8 x i8>
-; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP47]]
-; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[TMP82]], i32 0
-; CHECK-NEXT: store <8 x i8> [[TMP81]], ptr [[TMP83]], align 1
-; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX7]], 8
-; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP84]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX11]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2
+; CHECK-NEXT: [[TMP24:%.*]] = trunc <8 x i16> [[WIDE_LOAD12]] to <8 x i8>
+; CHECK-NEXT: [[TMP25:%.*]] = shl <8 x i8> [[TMP24]],
+; CHECK-NEXT: [[TMP26:%.*]] = add <8 x i8> [[TMP25]],
+; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i8> [[TMP24]],
+; CHECK-NEXT: [[TMP28:%.*]] = or <8 x i8> [[TMP27]],
+; CHECK-NEXT: [[TMP29:%.*]] = mul <8 x i8> [[TMP28]],
+; CHECK-NEXT: [[TMP30:%.*]] = and <8 x i8> [[TMP26]], [[TMP19]]
+; CHECK-NEXT: [[TMP31:%.*]] = and <8 x i8> [[TMP29]],
+; CHECK-NEXT: [[TMP32:%.*]] = xor <8 x i8> [[TMP31]], [[TMP20]]
+; CHECK-NEXT: [[TMP33:%.*]] = mul <8 x i8> [[TMP32]], [[TMP30]]
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i32 0
+; CHECK-NEXT: store <8 x i8> [[TMP33]], ptr [[TMP35]], align 1
+; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX11]], 8
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
 ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -802,8 +700,8 @@
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP85:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP85]] to i32
+; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP37]] to i32
 ; CHECK-NEXT: [[ADD:%.*]] = shl i32 [[CONV]], 4
 ; CHECK-NEXT: [[CONV2:%.*]] = add nsw i32 [[ADD]], 32
 ; CHECK-NEXT: [[OR:%.*]] = and i32 [[CONV]], 204
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
@@ -21,35 +21,33 @@
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
 ; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP6]] to <4 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
 ; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = ashr exact i64 [[TMP11]], 32
+; CHECK-NEXT: [[TMP16:%.*]] = ashr exact i64 [[TMP12]], 32
 ; CHECK-NEXT: [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32
 ; CHECK-NEXT: [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32
-; CHECK-NEXT: [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32
-; CHECK-NEXT: [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP16]]
 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]]
 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0
+; CHECK-NEXT: store i16 [[TMP23]], ptr [[TMP19]], align 2
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1
+; CHECK-NEXT: store i16 [[TMP24]], ptr [[TMP20]], align 2
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2
 ; CHECK-NEXT: store i16 [[TMP25]], ptr [[TMP21]], align 2
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3
 ; CHECK-NEXT: store i16 [[TMP26]], ptr [[TMP22]], align 2
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2
-; CHECK-NEXT: store i16 [[TMP27]], ptr [[TMP23]], align 2
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3
-; CHECK-NEXT: store i16 [[TMP28]], ptr [[TMP24]], align 2
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_INC1286_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -117,35 +115,33 @@
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2
 ; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i16>
 ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[WIDE_LOAD]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP8]] to <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i16>
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
 ; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
 ; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
-; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
-; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32
+; CHECK-NEXT: [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32
 ; CHECK-NEXT: [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32
 ; CHECK-NEXT: [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32
-; CHECK-NEXT: [[TMP21:%.*]] = ashr exact i64 [[TMP17]], 32
-; CHECK-NEXT: [[TMP22:%.*]] = ashr exact i64 [[TMP18]], 32
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]]
 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]]
 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP8]], i32 0
+; CHECK-NEXT: store i16 [[TMP25]], ptr [[TMP21]], align 2
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP8]], i32 1
+; CHECK-NEXT: store i16 [[TMP26]], ptr [[TMP22]], align 2
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2
 ; CHECK-NEXT: store i16 [[TMP27]], ptr [[TMP23]], align 2
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3
 ; CHECK-NEXT: store i16 [[TMP28]], ptr [[TMP24]], align 2
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2
-; CHECK-NEXT: store i16 [[TMP29]], ptr [[TMP25]], align 2
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3
-; CHECK-NEXT: store i16 [[TMP30]], ptr [[TMP26]], align 2
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_INC1286_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
--- a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
@@ -75,15 +75,15 @@
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[LEN]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[ARG1:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT]] to <vscale x 4 x i8>
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT]] to <vscale x 4 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = xor <vscale x 4 x i8> [[WIDE_LOAD]], [[TMP5]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = xor <vscale x 4 x i8> [[WIDE_LOAD]], [[TMP4]]
 ; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i8> [[TMP6]], [[WIDE_LOAD]]
-; CHECK-NEXT: store <vscale x 4 x i8> [[TMP7]], ptr [[TMP4]], align 1
+; CHECK-NEXT: store <vscale x 4 x i8> [[TMP7]], ptr [[TMP5]], align 1
 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
--- a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
@@ -328,16 +328,12 @@
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
-; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i16> [[TMP6]],
-; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i16> [[TMP7]] to <4 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
-; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i16> [[TMP9]] to <4 x i8>
-; CHECK-NEXT: store <4 x i8> [[TMP10]], ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i16> [[TMP4]],
+; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i16> [[TMP5]] to <4 x i8>
+; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP3]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph: