diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -623,10 +623,6 @@
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);
 
-  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
-  /// represented as.
-  void truncateToMinimalBitwidths(VPTransformState &State);
-
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
 
@@ -3386,151 +3382,8 @@
   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
 }
 
-void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
-  // For every instruction `I` in MinBWs, truncate the operands, create a
-  // truncated version of `I` and reextend its result. InstCombine runs
-  // later and will remove any ext/trunc pairs.
-  SmallPtrSet<Value *, 4> Erased;
-  for (const auto &KV : Cost->getMinimalBitwidths()) {
-    // If the value wasn't vectorized, we must maintain the original scalar
-    // type. The absence of the value from State indicates that it
-    // wasn't vectorized.
-    // FIXME: Should not rely on getVPValue at this point.
-    VPValue *Def = State.Plan->getVPValue(KV.first, true);
-    if (!State.hasAnyVectorValue(Def))
-      continue;
-    // If the instruction is defined outside the loop, only update the first
-    // part; the first part will be re-used for all other parts.
-    unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
-    for (unsigned Part = 0; Part < UFToUse; ++Part) {
-      Value *I = State.get(Def, Part);
-      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
-        continue;
-      Type *OriginalTy = I->getType();
-      Type *ScalarTruncatedTy =
-          IntegerType::get(OriginalTy->getContext(), KV.second);
-      auto *TruncatedTy = VectorType::get(
-          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
-      if (TruncatedTy == OriginalTy)
-        continue;
-
-      IRBuilder<> B(cast<Instruction>(I));
-      auto ShrinkOperand = [&](Value *V) -> Value * {
-        if (auto *ZI = dyn_cast<ZExtInst>(V))
-          if (ZI->getSrcTy() == TruncatedTy)
-            return ZI->getOperand(0);
-        return B.CreateZExtOrTrunc(V, TruncatedTy);
-      };
-
-      // The actual instruction modification depends on the instruction type,
-      // unfortunately.
-      Value *NewI = nullptr;
-      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
-        Value *Op0 = ShrinkOperand(BO->getOperand(0));
-        Value *Op1 = ShrinkOperand(BO->getOperand(1));
-        NewI = B.CreateBinOp(BO->getOpcode(), Op0, Op1);
-
-        // Any wrapping introduced by shrinking this operation shouldn't be
-        // considered undefined behavior. So, we can't unconditionally copy
-        // arithmetic wrapping flags to NewI.
-        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
-      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
-        Value *Op0 = ShrinkOperand(CI->getOperand(0));
-        Value *Op1 = ShrinkOperand(CI->getOperand(1));
-        NewI = B.CreateICmp(CI->getPredicate(), Op0, Op1);
-      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
-        Value *TV = ShrinkOperand(SI->getTrueValue());
-        Value *FV = ShrinkOperand(SI->getFalseValue());
-        NewI = B.CreateSelect(SI->getCondition(), TV, FV);
-      } else if (auto *CI = dyn_cast<CastInst>(I)) {
-        switch (CI->getOpcode()) {
-        default:
-          llvm_unreachable("Unhandled cast!");
-        case Instruction::Trunc:
-          NewI = ShrinkOperand(CI->getOperand(0));
-          break;
-        case Instruction::SExt:
-          NewI = B.CreateSExtOrTrunc(
-              CI->getOperand(0),
-              smallestIntegerVectorType(OriginalTy, TruncatedTy));
-          break;
-        case Instruction::ZExt:
-          NewI = B.CreateZExtOrTrunc(
-              CI->getOperand(0),
-              smallestIntegerVectorType(OriginalTy, TruncatedTy));
-          break;
-        }
-      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
-        auto Elements0 =
-            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
-        auto Elements1 =
-            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
-        auto *O1 = B.CreateZExtOrTrunc(
-            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
-
-        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
-      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
-        // Don't do anything with the operands, just extend the result.
-        continue;
-      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
-        auto Elements =
-            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
-        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
-        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
-      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
-        auto Elements =
-            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
-        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
-      } else {
-        // If we don't know what to do, be conservative and don't do anything.
-        continue;
-      }
-
-      // Lastly, extend the result.
-      NewI->takeName(cast<Instruction>(I));
-      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
-      I->replaceAllUsesWith(Res);
-      cast<Instruction>(I)->eraseFromParent();
-      Erased.insert(I);
-      State.reset(Def, Res, Part);
-    }
-  }
-
-  // We'll have created a bunch of ZExts that are now parentless. Clean up.
-  for (const auto &KV : Cost->getMinimalBitwidths()) {
-    // If the value wasn't vectorized, we must maintain the original scalar
-    // type. The absence of the value from State indicates that it
-    // wasn't vectorized.
-    // FIXME: Should not rely on getVPValue at this point.
-    VPValue *Def = State.Plan->getVPValue(KV.first, true);
-    if (!State.hasAnyVectorValue(Def))
-      continue;
-    unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
-    for (unsigned Part = 0; Part < UFToUse; ++Part) {
-      Value *I = State.get(Def, Part);
-      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
-      if (Inst && Inst->use_empty()) {
-        Value *NewI = Inst->getOperand(0);
-        Inst->eraseFromParent();
-        State.reset(Def, NewI, Part);
-      }
-    }
-  }
-}
-
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                             VPlan &Plan) {
-  // Insert truncates and extends for any truncated instructions as hints to
-  // InstCombine.
-  if (VF.isVector())
-    truncateToMinimalBitwidths(State);
-
   // Fix widened non-induction PHIs by setting up the PHI operands.
   if (EnableVPlanNativePath)
     fixNonInductionPHIs(Plan, State);
 
@@ -8671,7 +8524,7 @@
       VFRange SubRange = {VF, MaxVFTimes2};
       if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
        // Now optimize the initial VPlan.
-        VPlanTransforms::optimize(*Plan, *PSE.getSE());
+        VPlanTransforms::optimize(*Plan, *PSE.getSE(), CM.getMinimalBitwidths());
        assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
        VPlans.push_back(std::move(Plan));
      }
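Note: the function removed above is re-implemented as a VPlan-to-VPlan transform in VPlanTransforms.cpp below. For illustration only (a minimal IR sketch with made-up values, not taken from this patch's tests), the rewrite it performs looks like this when the cost model records a minimal bitwidth of 16 for a multiply of zero-extended i8 operands:

    ; before shrinking: everything widened to the original i32 type
    %a.ext = zext <16 x i8> %a to <16 x i32>
    %b.ext = zext <16 x i8> %b to <16 x i32>
    %mul   = mul <16 x i32> %a.ext, %b.ext

    ; after shrinking: operate on i16 and re-extend the result; InstCombine
    ; later removes the leftover ext/trunc pairs.
    %a.ext = zext <16 x i8> %a to <16 x i16>
    %b.ext = zext <16 x i8> %b to <16 x i16>
    %mul   = mul <16 x i16> %a.ext, %b.ext
    %res   = zext <16 x i16> %mul to <16 x i32>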
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -274,10 +274,6 @@
            I->second[Part];
   }
 
-  bool hasAnyVectorValue(VPValue *Def) const {
-    return Data.PerPartOutput.contains(Def);
-  }
-
   bool hasScalarValue(VPValue *Def, VPIteration Instance) {
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())
@@ -2706,6 +2702,8 @@
   VPBasicBlock *getPreheader() { return Preheader; }
   const VPBasicBlock *getPreheader() const { return Preheader; }
 
+  ArrayRef<VPValue *> getLiveIns() const { return VPLiveInsToFree; }
+
 private:
   /// Add to the given dominator tree the header block and every new basic block
   /// that was created between it and the latch block, inclusive.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -59,7 +59,8 @@
   /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
   /// optimizations, dead recipe removal, replicate region optimizations and
   /// block merging.
-  static void optimize(VPlan &Plan, ScalarEvolution &SE);
+  static void optimize(VPlan &Plan, ScalarEvolution &SE,
+                       const MapVector<Instruction *, uint64_t> &MinBWs);
 
   /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
   /// region block and remove the mask operand. Optimize the created regions by
@@ -79,6 +80,12 @@
                     bool UseActiveLaneMaskForControlFlow,
                     bool DataAndControlFlowWithoutRuntimeCheck);
 
+  /// Insert truncates and extends for any truncated instructions as hints to
+  /// InstCombine.
+  static void
+  truncateToMinimalBitwidths(VPlan &Plan,
+                             const MapVector<Instruction *, uint64_t> &MinBWs);
+
 private:
   /// Remove redundant VPBasicBlocks by merging them into their predecessor if
   /// the predecessor has a single successor.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -868,12 +868,130 @@
   }
 }
 
-void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
+void VPlanTransforms::truncateToMinimalBitwidths(
+    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
+#ifndef NDEBUG
+  unsigned ProcessedRecipes = 0;
+#endif
+  VPBasicBlock *PH =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
+
+  // First truncate live-ins that represent relevant instructions.
+  for (VPValue *VPV : Plan.getLiveIns()) {
+    auto *LiveInInst = dyn_cast<Instruction>(VPV->getLiveInIRValue());
+    unsigned NewResSizeInBits = MinBWs.lookup(LiveInInst);
+    if (!LiveInInst || !NewResSizeInBits)
+      continue;
+
+    Type *ResTy = LiveInInst->getType();
+    if (!ResTy->isIntegerTy())
+      continue;
+
+    LLVMContext &Ctx = ResTy->getContext();
+    auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+    auto *Shrunk = new VPWidenCastRecipe(Instruction::Trunc, VPV, NewResTy);
+    PH->appendRecipe(Shrunk);
+    VPV->replaceAllUsesWith(Shrunk);
+    Shrunk->setOperand(0, VPV);
+#ifndef NDEBUG
+    ProcessedRecipes++;
+#endif
+  }
+
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      if (auto *Mem = dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
+#ifndef NDEBUG
+        ProcessedRecipes += MinBWs.count(&Mem->getIngredient());
+#endif
+        continue;
+      }
+      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+               VPWidenSelectRecipe>(&R))
+        continue;
+
+      VPValue *ResultVPV = R.getVPSingleValue();
+      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
+      unsigned NewResSizeInBits = MinBWs.lookup(UI);
+      if (!UI || !NewResSizeInBits)
+        continue;
+
+#ifndef NDEBUG
+      ProcessedRecipes++;
+#endif
+
+      // Only widen recipes are handled at the moment, but there may be entries
+      // for replicate recipes in MinBWs. Skip those here, after incrementing
+      // ProcessedRecipes.
+      if (isa<VPReplicateRecipe>(&R))
+        continue;
+
+      unsigned ResSizeInBits = getTypeSizeInBits(ResultVPV);
+      Type *ResTy = UI->getType();
+      assert(ResTy->isIntegerTy() && "only integer types supported");
+      if (ResSizeInBits == NewResSizeInBits)
+        continue;
+
+      LLVMContext &Ctx = ResTy->getContext();
+      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+
+      // Try to replace wider SExt/ZExts with narrower ones if possible.
+      if (auto *VPC = dyn_cast<VPWidenCastRecipe>(&R)) {
+        unsigned Opc = VPC->getOpcode();
+        if (Opc == Instruction::SExt || Opc == Instruction::ZExt) {
+          assert(ResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
+          // SExt/Zext is redundant - stick with its operand.
+          Instruction::CastOps Opcode = VPC->getOpcode();
+          VPValue *Op = R.getOperand(0);
+          if (getTypeSizeInBits(Op) > NewResSizeInBits)
+            Opcode = Instruction::Trunc;
+          auto *C = new VPWidenCastRecipe(Opcode, Op, NewResTy);
+          C->insertBefore(VPC);
+          VPC->replaceAllUsesWith(C);
+          continue;
+        }
+      }
+
+      // Shrink operands by introducing truncates as needed.
+      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
+      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
+        auto *Op = R.getOperand(Idx);
+        unsigned OpSizeInBits = getTypeSizeInBits(Op);
+        if (OpSizeInBits == NewResSizeInBits)
+          continue;
+        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
+        auto *Shrunk = new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy);
+        Shrunk->insertBefore(&R);
+        R.setOperand(Idx, Shrunk);
+      }
+
+      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
+        VPW->dropPoisonGeneratingFlags();
+
+      // Extend result to original width.
+      auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, ResTy);
+      Ext->insertAfter(&R);
+      ResultVPV->replaceAllUsesWith(Ext);
+      Ext->setOperand(0, ResultVPV);
+    }
+  }
+
+  assert(MinBWs.size() == ProcessedRecipes &&
+         "some entries in MinBWs haven't been processed");
+}
+
+void VPlanTransforms::optimize(
+    VPlan &Plan, ScalarEvolution &SE,
+    const MapVector<Instruction *, uint64_t> &MinBWs) {
   removeRedundantCanonicalIVs(Plan);
   removeRedundantInductionCasts(Plan);
-  optimizeInductions(Plan, SE);
+
+  if (!Plan.hasVF(ElementCount::getFixed(1)))
+    truncateToMinimalBitwidths(Plan, MinBWs);
+
   simplifyRecipes(Plan);
+
   removeDeadRecipes(Plan);
   createAndOptimizeReplicateRegions(Plan);
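The VPlan version above rewrites recipes rather than IR instructions: operands are truncated via new VPWidenCastRecipes, the result is re-extended, and redundant SExt/ZExt recipes are narrowed in place. Dropping poison-generating flags on the shrunk recipe plays the same role as the old /*IncludeWrapFlags=*/false: wrapping introduced by narrowing must not become immediate UB. A minimal sketch of the cast-narrowing case (hypothetical values, not from this patch): with a minimal bitwidth of 16 recorded for the user of a zext from i8 to i32, the wide extend is rebuilt with a narrower destination type instead of truncating its result:

    ; before                                ; after
    %e = zext <16 x i8> %x to <16 x i32>    %e = zext <16 x i8> %x to <16 x i16>
    %a = add <16 x i32> %e, %e              %a = add <16 x i16> %e, %e
                                            %r = zext <16 x i16> %a to <16 x i32>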
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -28,25 +28,24 @@
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw <16 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw <16 x i16> [[TMP4]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP5]],
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <16 x i16> [[TMP6]] to <16 x i8>
-; CHECK-NEXT:    store <16 x i8> [[TMP7]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP7]], ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw <16 x i16> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr <16 x i16> [[TMP11]],
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i16> [[TMP12]] to <16 x i8>
-; CHECK-NEXT:    store <16 x i8> [[TMP13]], ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <16 x i16> [[TMP9]], [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr <16 x i16> [[TMP10]],
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc <16 x i16> [[TMP11]] to <16 x i8>
+; CHECK-NEXT:    store <16 x i8> [[TMP12]], ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -60,27 +59,26 @@
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX7]]
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX7]]
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX7]]
 ; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <8 x i8>, ptr [[TMP16]], align 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = zext <8 x i8> [[WIDE_LOAD9]] to <8 x i16>
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16>
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw <8 x i16> [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = lshr <8 x i16> [[TMP19]],
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc <8 x i16> [[TMP20]] to <8 x i8>
-; CHECK-NEXT:    store <8 x i8> [[TMP21]], ptr [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX7]]
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i8>, ptr [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <8 x i8> [[WIDE_LOAD10]] to <8 x i16>
-; CHECK-NEXT:    [[TMP24:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16>
-; CHECK-NEXT:    [[TMP25:%.*]] = mul nuw <8 x i16> [[TMP23]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = lshr <8 x i16> [[TMP25]],
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc <8 x i16> [[TMP26]] to <8 x i8>
-; CHECK-NEXT:    store <8 x i8> [[TMP27]], ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw <8 x i16> [[TMP17]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = lshr <8 x i16> [[TMP18]],
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc <8 x i16> [[TMP19]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP20]], ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX7]]
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <8 x i8> [[WIDE_LOAD10]] to <8 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = mul nuw <8 x i16> [[TMP22]], [[TMP15]]
+; CHECK-NEXT:    [[TMP24:%.*]] = lshr <8 x i16> [[TMP23]],
+; CHECK-NEXT:    [[TMP25:%.*]] = trunc <8 x i16> [[TMP24]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 8
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[N_VEC5]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -94,18 +92,18 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP27]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP30]] to i32
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP28]] to i32
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV3]], [[CONV]]
 ; CHECK-NEXT:    [[SHR_26:%.*]] = lshr i32 [[MUL]], 8
 ; CHECK-NEXT:    [[CONV4:%.*]] = trunc i32 [[SHR_26]] to i8
 ; CHECK-NEXT:    store i8 [[CONV4]], ptr [[ARRAYIDX2]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP31]] to i32
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP29]] to i32
 ; CHECK-NEXT:    [[MUL10:%.*]] = mul nuw nsw i32 [[CONV9]], [[CONV]]
 ; CHECK-NEXT:    [[SHR11_27:%.*]] = lshr i32 [[MUL10]], 8
 ; CHECK-NEXT:    [[CONV12:%.*]] = trunc i32 [[SHR11_27]] to i8
@@ -158,54 +156,57 @@
 ; CHECK-LABEL: define void @test_shrink_zext_in_preheader
 ; CHECK-SAME: (ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[A:%.*]], i16 [[B:%.*]]) {
 ; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[CONV10:%.*]] = zext i16 [[B]] to i32
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV10]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i16> undef, i16 [[B]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[TMP0]], <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <16 x i16> [[BROADCAST_SPLAT2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <16 x i16> [[BROADCAST_SPLAT2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr <16 x i16> [[TMP2]],
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <16 x i16> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <16 x i16> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]],
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc <16 x i16> [[TMP5]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]],
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i16> [[TMP6]] to <16 x i8>
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[INDEX]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP9]]
-; CHECK-NEXT:    store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 16
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8>
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
 ; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16
+; CHECK-NEXT:    store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX4:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = trunc i32 [[A]] to i16
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i16> undef, i16 [[TMP14]], i64 0
-; CHECK-NEXT:    [[TMP16:%.*]] = mul <8 x i16> [[TMP15]], [[TMP13]]
-; CHECK-NEXT:    [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]],
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc <8 x i16> [[TMP17]] to <8 x i8>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP18]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[INDEX4]] to i64
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP20]]
-; CHECK-NEXT:    store <8 x i8> [[TMP19]], ptr [[TMP21]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 8
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 1000
-; CHECK-NEXT:    br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc i32 [[A]] to i16
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i16> undef, i16 [[TMP15]], i64 0
+; CHECK-NEXT:    [[TMP17:%.*]] = mul <8 x i16> [[TMP16]], [[TMP14]]
+; CHECK-NEXT:    [[TMP18:%.*]] = lshr <8 x i16> [[TMP17]],
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc <8 x i16> [[TMP18]] to <8 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext i32 [[INDEX6]] to i64
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP21]]
+; CHECK-NEXT:    store <8 x i8> [[TMP20]], ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i32 [[INDEX6]], 8
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 1000
+; CHECK-NEXT:    br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
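The live-in handling is visible in test_shrink_zext_in_preheader above: the broadcast of the zero-extended argument is truncated once in the vector preheader instead of on every loop iteration. A minimal sketch of the emitted preheader pattern (names illustrative, not copied from the checks):

    vector.ph:
      %splatinsert = insertelement <16 x i32> poison, i32 %conv, i64 0
      %splat = shufflevector <16 x i32> %splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer
      %splat.i16 = trunc <16 x i32> %splat to <16 x i16>   ; hoisted out of vector.body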
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -307,17 +307,15 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i16> [[TMP4]] to <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc <16 x i32> [[TMP5]] to <16 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP6]],
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <16 x i16> [[TMP7]] to <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
-; CHECK-NEXT:    store <16 x i16> [[TMP9]], ptr [[TMP11]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i16> [[TMP4]],
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i16> [[TMP5]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <16 x i16> [[TMP7]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -332,22 +330,20 @@
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX5]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP15]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i16>
-; CHECK-NEXT:    [[TMP17:%.*]] = zext <8 x i16> [[TMP16]] to <8 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc <8 x i32> [[TMP17]] to <8 x i16>
-; CHECK-NEXT:    [[TMP19:%.*]] = add <8 x i16> [[TMP18]],
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <8 x i16> [[TMP19]] to <8 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc <8 x i32> [[TMP20]] to <8 x i16>
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0
-; CHECK-NEXT:    store <8 x i16> [[TMP21]], ptr [[TMP23]], align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = add <8 x i16> [[TMP14]],
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <8 x i16> [[TMP15]] to <8 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc <8 x i32> [[TMP16]] to <8 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP17]], ptr [[TMP19]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], 8
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -361,8 +357,8 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP25]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP21]] to i32
 ; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
 ; CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]]
@@ -485,52 +481,48 @@
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV13]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT]] to <16 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i8>
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[CONV11]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT2]] to <16 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT3]] to <16 x i8>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = shl <16 x i8> [[TMP8]],
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = shl <16 x i8> [[WIDE_LOAD]],
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <16 x i8> [[TMP8]],
 ; CHECK-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[TMP9]] to <16 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <16 x i32> [[TMP10]] to <16 x i8>
-; CHECK-NEXT:    [[TMP12:%.*]] = add <16 x i8> [[TMP11]],
-; CHECK-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i8> [[TMP8]],
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i8> [[WIDE_LOAD]],
+; CHECK-NEXT:    [[TMP12:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <16 x i8> [[TMP13]],
 ; CHECK-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8>
-; CHECK-NEXT:    [[TMP17:%.*]] = mul <16 x i8> [[TMP16]],
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc <16 x i32> [[TMP10]] to <16 x i8>
+; CHECK-NEXT:    [[TMP17:%.*]] = and <16 x i8> [[TMP16]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8>
-; CHECK-NEXT:    [[TMP20:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT:    [[TMP21:%.*]] = and <16 x i8> [[TMP19]], [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = zext <16 x i8> [[TMP21]] to <16 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = trunc <16 x i32> [[TMP18]] to <16 x i8>
-; CHECK-NEXT:    [[TMP24:%.*]] = and <16 x i8> [[TMP23]],
-; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
-; CHECK-NEXT:    [[TMP26:%.*]] = trunc <16 x i32> [[TMP25]] to <16 x i8>
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc <16 x i32> [[TMP4]] to <16 x i8>
-; CHECK-NEXT:    [[TMP28:%.*]] = xor <16 x i8> [[TMP26]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = trunc <16 x i32> [[TMP29]] to <16 x i8>
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc <16 x i32> [[TMP22]] to <16 x i8>
-; CHECK-NEXT:    [[TMP32:%.*]] = mul <16 x i8> [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
-; CHECK-NEXT:    [[TMP34:%.*]] = trunc <16 x i32> [[TMP33]] to <16 x i8>
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i32 0
-; CHECK-NEXT:    store <16 x i8> [[TMP34]], ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = and <16 x i8> [[TMP19]],
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc <16 x i32> [[TMP21]] to <16 x i8>
+; CHECK-NEXT:    [[TMP23:%.*]] = xor <16 x i8> [[TMP22]], [[TMP2]]
+; CHECK-NEXT:    [[TMP24:%.*]] = zext <16 x i8> [[TMP23]] to <16 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = trunc <16 x i32> [[TMP18]] to <16 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = mul <16 x i8> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <16 x i8> [[TMP27]] to <16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = trunc <16 x i32> [[TMP28]] to <16 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[TMP29]], ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -542,53 +534,49 @@
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
-; CHECK-NEXT:    [[TMP38:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT8]] to <8 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x i8> [[TMP38]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP39:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT9]] to <8 x i32>
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
-; CHECK-NEXT:    [[TMP40:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT10]] to <8 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x i8> [[TMP40]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP41:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT11]] to <8 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT7]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT8]] to <8 x i8>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT9]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT10]] to <8 x i8>
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[INDEX7]], 0
-; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP42]]
-; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP43]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = load <8 x i8>, ptr [[TMP44]], align 1
-; CHECK-NEXT:    [[TMP46:%.*]] = shl <8 x i8> [[TMP45]],
+; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[INDEX11]], 0
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP38:%.*]] = shl <8 x i8> [[WIDE_LOAD12]],
+; CHECK-NEXT:    [[TMP39:%.*]] = zext <8 x i8> [[TMP38]] to <8 x i32>
+; CHECK-NEXT:    [[TMP40:%.*]] = trunc <8 x i32> [[TMP39]] to <8 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = add <8 x i8> [[TMP40]],
+; CHECK-NEXT:    [[TMP42:%.*]] = zext <8 x i8> [[TMP41]] to <8 x i32>
+; CHECK-NEXT:    [[TMP43:%.*]] = or <8 x i8> [[WIDE_LOAD12]],
+; CHECK-NEXT:    [[TMP44:%.*]] = zext <8 x i8> [[TMP43]] to <8 x i32>
+; CHECK-NEXT:    [[TMP45:%.*]] = trunc <8 x i32> [[TMP44]] to <8 x i8>
+; CHECK-NEXT:    [[TMP46:%.*]] = mul <8 x i8> [[TMP45]],
 ; CHECK-NEXT:    [[TMP47:%.*]] = zext <8 x i8> [[TMP46]] to <8 x i32>
-; CHECK-NEXT:    [[TMP48:%.*]] = trunc <8 x i32> [[TMP47]] to <8 x i8>
-; CHECK-NEXT:    [[TMP49:%.*]] = add <8 x i8> [[TMP48]],
+; CHECK-NEXT:    [[TMP48:%.*]] = trunc <8 x i32> [[TMP42]] to <8 x i8>
+; CHECK-NEXT:    [[TMP49:%.*]] = and <8 x i8> [[TMP48]], [[TMP33]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = zext <8 x i8> [[TMP49]] to <8 x i32>
-; CHECK-NEXT:    [[TMP51:%.*]] = or <8 x i8> [[TMP45]],
-; CHECK-NEXT:    [[TMP52:%.*]] = zext <8 x i8> [[TMP51]] to <8 x i32>
-; CHECK-NEXT:    [[TMP53:%.*]] = trunc <8 x i32> [[TMP52]] to <8 x i8>
-; CHECK-NEXT:    [[TMP54:%.*]] = mul <8 x i8> [[TMP53]],
-; CHECK-NEXT:    [[TMP55:%.*]] = zext <8 x i8> [[TMP54]] to <8 x i32>
-; CHECK-NEXT:    [[TMP56:%.*]] = trunc <8 x i32> [[TMP50]] to <8 x i8>
-; CHECK-NEXT:    [[TMP57:%.*]] = trunc <8 x i32> [[TMP39]] to <8 x i8>
-; CHECK-NEXT:    [[TMP58:%.*]] = and <8 x i8> [[TMP56]], [[TMP57]]
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <8 x i8> [[TMP58]] to <8 x i32>
-; CHECK-NEXT:    [[TMP60:%.*]] = trunc <8 x i32> [[TMP55]] to <8 x i8>
-; CHECK-NEXT:    [[TMP61:%.*]] = and <8 x i8> [[TMP60]],
-; CHECK-NEXT:    [[TMP62:%.*]] = zext <8 x i8> [[TMP61]] to <8 x i32>
-; CHECK-NEXT:    [[TMP63:%.*]] = trunc <8 x i32> [[TMP62]] to <8 x i8>
-; CHECK-NEXT:    [[TMP64:%.*]] = trunc <8 x i32> [[TMP41]] to <8 x i8>
-; CHECK-NEXT:    [[TMP65:%.*]] = xor <8 x i8> [[TMP63]], [[TMP64]]
-; CHECK-NEXT:    [[TMP66:%.*]] = zext <8 x i8> [[TMP65]] to <8 x i32>
-; CHECK-NEXT:    [[TMP67:%.*]] = trunc <8 x i32> [[TMP66]] to <8 x i8>
-; CHECK-NEXT:    [[TMP68:%.*]] = trunc <8 x i32> [[TMP59]] to <8 x i8>
-; CHECK-NEXT:    [[TMP69:%.*]] = mul <8 x i8> [[TMP67]], [[TMP68]]
-; CHECK-NEXT:    [[TMP70:%.*]] = zext <8 x i8> [[TMP69]] to <8 x i32>
-; CHECK-NEXT:    [[TMP71:%.*]] = trunc <8 x i32> [[TMP70]] to <8 x i8>
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP42]]
-; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[TMP72]], i32 0
-; CHECK-NEXT:    store <8 x i8> [[TMP71]], ptr [[TMP73]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT12]] = add nuw i64 [[INDEX7]], 8
-; CHECK-NEXT:    [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP74]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP51:%.*]] = trunc <8 x i32> [[TMP47]] to <8 x i8>
+; CHECK-NEXT:    [[TMP52:%.*]] = and <8 x i8> [[TMP51]],
+; CHECK-NEXT:    [[TMP53:%.*]] = zext <8 x i8> [[TMP52]] to <8 x i32>
+; CHECK-NEXT:    [[TMP54:%.*]] = trunc <8 x i32> [[TMP53]] to <8 x i8>
+; CHECK-NEXT:    [[TMP55:%.*]] = xor <8 x i8> [[TMP54]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = zext <8 x i8> [[TMP55]] to <8 x i32>
+; CHECK-NEXT:    [[TMP57:%.*]] = trunc <8 x i32> [[TMP56]] to <8 x i8>
+; CHECK-NEXT:    [[TMP58:%.*]] = trunc <8 x i32> [[TMP50]] to <8 x i8>
+; CHECK-NEXT:    [[TMP59:%.*]] = mul <8 x i8> [[TMP57]], [[TMP58]]
+; CHECK-NEXT:    [[TMP60:%.*]] = zext <8 x i8> [[TMP59]] to <8 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = trunc <8 x i32> [[TMP60]] to <8 x i8>
+; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP62]], i32 0
+; CHECK-NEXT:    store <8 x i8> [[TMP61]], ptr [[TMP63]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX11]], 8
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP64]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -602,8 +590,8 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP75:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP75]] to i32
+; CHECK-NEXT:    [[TMP65:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP65]] to i32
 ; CHECK-NEXT:    [[ADD:%.*]] = shl i32 [[CONV]], 4
 ; CHECK-NEXT:    [[CONV2:%.*]] = add nuw nsw i32 [[ADD]], 32
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[CONV]], 51
@@ -673,58 +661,52 @@
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV13]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT]] to <16 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i8>
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[CONV11]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT2]] to <16 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT3]] to <16 x i8>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[TMP7]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i16> [[WIDE_LOAD]] to <16 x i8>
-; CHECK-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <16 x i32> [[TMP9]] to <16 x i8>
-; CHECK-NEXT:    [[TMP11:%.*]] = shl <16 x i8> [[TMP10]],
-; CHECK-NEXT:    [[TMP12:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
-; CHECK-NEXT:    [[TMP14:%.*]] = add <16 x i8> [[TMP13]],
-; CHECK-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = and <16 x i8> [[TMP8]],
-; CHECK-NEXT:    [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc <16 x i32> [[TMP17]] to <16 x i8>
-; CHECK-NEXT:    [[TMP19:%.*]] = or <16 x i8> [[TMP18]],
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc <16 x i32> [[TMP20]] to <16 x i8>
-; CHECK-NEXT:    [[TMP22:%.*]] = mul <16 x i8> [[TMP21]],
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <16 x i8> [[TMP22]] to <16 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8>
-; CHECK-NEXT:    [[TMP25:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT:    [[TMP26:%.*]] = and <16 x i8> [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
-; CHECK-NEXT:    [[TMP28:%.*]] = trunc <16 x i32> [[TMP23]] to <16 x i8>
-; CHECK-NEXT:    [[TMP29:%.*]] = and <16 x i8> [[TMP28]],
-; CHECK-NEXT:    [[TMP30:%.*]] = zext <16 x i8> [[TMP29]] to <16 x i32>
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc <16 x i32> [[TMP30]] to <16 x i8>
-; CHECK-NEXT:    [[TMP32:%.*]] = trunc <16 x i32> [[TMP4]] to <16 x i8>
-; CHECK-NEXT:    [[TMP33:%.*]] = xor <16 x i8> [[TMP31]], [[TMP32]]
-; CHECK-NEXT:    [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
-; CHECK-NEXT:    [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8>
-; CHECK-NEXT:    [[TMP36:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
-; CHECK-NEXT:    [[TMP37:%.*]] = mul <16 x i8> [[TMP35]], [[TMP36]]
-; CHECK-NEXT:    [[TMP38:%.*]] = zext <16 x i8> [[TMP37]] to <16 x i32>
-; CHECK-NEXT:    [[TMP39:%.*]] = trunc <16 x i32> [[TMP38]] to <16 x i8>
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP40]], i32 0
-; CHECK-NEXT:    store <16 x i8> [[TMP39]], ptr [[TMP41]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc <16 x i16> [[WIDE_LOAD]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <16 x i8> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <16 x i8> [[TMP7]] to <16 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i8>
+; CHECK-NEXT:    [[TMP10:%.*]] = add <16 x i8> [[TMP9]],
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i8> [[TMP6]],
+; CHECK-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i8> [[TMP14]],
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc <16 x i32> [[TMP16]] to <16 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = mul <16 x i8> [[TMP17]],
+; CHECK-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc <16 x i32> [[TMP11]] to <16 x i8>
+; CHECK-NEXT:    [[TMP21:%.*]] = and <16 x i8> [[TMP20]], [[TMP1]]
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <16 x i8> [[TMP21]] to <16 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = trunc <16 x i32> [[TMP19]] to <16 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = and <16 x i8> [[TMP23]],
+; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = trunc <16 x i32> [[TMP25]] to <16 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = xor <16 x i8> [[TMP26]], [[TMP2]]
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <16 x i8> [[TMP27]] to <16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = trunc <16 x i32> [[TMP28]] to <16 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = trunc <16 x i32> [[TMP22]] to <16 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = mul <16 x i8> [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP32:%.*]] = zext <16 x i8> [[TMP31]] to <16 x i32>
+; CHECK-NEXT:    [[TMP33:%.*]] = trunc <16 x i32> [[TMP32]] to <16 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[TMP33]], ptr [[TMP35]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -736,59 +718,53 @@
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
-; CHECK-NEXT:    [[TMP43:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT9]] to <8 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i8> [[TMP43]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP44:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT10]] to <8 x i32>
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT11]] to <8 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[TMP45]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP46:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT12]] to <8 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT7]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT8]] to <8 x i8>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT9]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT10]] to <8 x i8>
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP47:%.*]] = add i64 [[INDEX7]], 0
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i16, ptr [[TMP48]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP49]], align 2
-; CHECK-NEXT:    [[TMP50:%.*]] = trunc <8 x i16> [[WIDE_LOAD8]] to <8 x i8>
-; CHECK-NEXT:    [[TMP51:%.*]] = zext <8 x i8> [[TMP50]] to <8 x i32>
-; CHECK-NEXT:    [[TMP52:%.*]] = trunc <8 x i32> [[TMP51]] to <8 x i8>
-; CHECK-NEXT:    [[TMP53:%.*]] = shl <8 x i8> [[TMP52]],
-; CHECK-NEXT:    [[TMP54:%.*]] = zext <8 x i8> [[TMP53]] to <8 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = trunc <8 x i32> [[TMP54]] to <8 x i8>
-; CHECK-NEXT:    [[TMP56:%.*]] = add <8 x i8> [[TMP55]],
-; CHECK-NEXT:    [[TMP57:%.*]] = zext <8 x i8> [[TMP56]] to <8 x i32>
-; CHECK-NEXT:    [[TMP58:%.*]] = and <8 x i8> [[TMP50]],
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <8 x i8> [[TMP58]] to <8 x i32>
-; CHECK-NEXT:    [[TMP60:%.*]] = trunc <8 x i32> [[TMP59]] to <8 x i8>
-; CHECK-NEXT:    [[TMP61:%.*]] = or <8 x i8> [[TMP60]],
-; CHECK-NEXT:    [[TMP62:%.*]] = zext <8 x i8> [[TMP61]] to <8 x i32>
-; CHECK-NEXT:    [[TMP63:%.*]] = trunc <8 x i32> [[TMP62]] to <8 x i8>
-; CHECK-NEXT:    [[TMP64:%.*]] = mul <8 x i8> [[TMP63]],
-; CHECK-NEXT:    [[TMP65:%.*]] = zext <8 x i8> [[TMP64]] to <8 x i32>
-; CHECK-NEXT:    [[TMP66:%.*]] = trunc <8 x i32> [[TMP57]] to <8 x i8>
-; CHECK-NEXT:    [[TMP67:%.*]] = trunc <8 x i32> [[TMP44]] to <8 x i8>
-; CHECK-NEXT:    [[TMP68:%.*]] = and <8 x i8> [[TMP66]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = zext <8 x i8> [[TMP68]] to <8 x i32>
-; CHECK-NEXT:    [[TMP70:%.*]] = trunc <8 x i32> [[TMP65]] to <8 x i8>
-; CHECK-NEXT:    [[TMP71:%.*]] = and <8 x i8> [[TMP70]],
-; CHECK-NEXT:    [[TMP72:%.*]] = zext <8 x i8> [[TMP71]] to <8 x i32>
-; CHECK-NEXT:    [[TMP73:%.*]] = trunc <8 x i32> [[TMP72]] to <8 x i8>
-; CHECK-NEXT:    [[TMP74:%.*]] = trunc <8 x i32> [[TMP46]] to <8 x i8>
-; CHECK-NEXT:    [[TMP75:%.*]] = xor <8 x i8> [[TMP73]], [[TMP74]]
-; CHECK-NEXT:    [[TMP76:%.*]] = zext <8 x i8> [[TMP75]] to <8 x i32>
-; CHECK-NEXT:    [[TMP77:%.*]] = trunc <8 x i32> [[TMP76]] to <8 x i8>
-; CHECK-NEXT:    [[TMP78:%.*]] = trunc <8 x i32> [[TMP69]] to <8 x i8>
-; CHECK-NEXT:    [[TMP79:%.*]] = mul <8 x i8> [[TMP77]], [[TMP78]]
-; CHECK-NEXT:    [[TMP80:%.*]] = zext <8 x i8> [[TMP79]] to <8 x i32>
-; CHECK-NEXT:    [[TMP81:%.*]] = trunc <8 x i32> [[TMP80]] to <8 x i8>
-; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP47]]
-; CHECK-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[TMP82]], i32 0
-; CHECK-NEXT:    store <8 x i8> [[TMP81]], ptr [[TMP83]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX7]], 8
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP84]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX11]], 0
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i16, ptr [[TMP40]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i16>, ptr [[TMP41]], align 2
+; CHECK-NEXT:    [[TMP42:%.*]] = trunc <8 x i16> [[WIDE_LOAD12]] to <8 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = shl <8 x i8> [[TMP42]],
+; CHECK-NEXT:    [[TMP44:%.*]] = zext <8 x i8> [[TMP43]] to <8 x i32>
+; CHECK-NEXT:    [[TMP45:%.*]] = trunc <8 x i32> [[TMP44]] to <8 x i8>
+; CHECK-NEXT:    [[TMP46:%.*]] = add <8 x i8> [[TMP45]],
+; CHECK-NEXT:    [[TMP47:%.*]] = zext <8 x i8> [[TMP46]] to <8 x i32>
+; CHECK-NEXT:    [[TMP48:%.*]] = and <8 x i8> [[TMP42]],
+; CHECK-NEXT:    [[TMP49:%.*]] = zext <8 x i8> [[TMP48]] to <8 x i32>
+; CHECK-NEXT:    [[TMP50:%.*]] = trunc <8 x i32> [[TMP49]] to <8 x i8>
+; CHECK-NEXT:    [[TMP51:%.*]] = or <8 x i8> [[TMP50]],
+; CHECK-NEXT:    [[TMP52:%.*]] = zext <8 x i8> [[TMP51]] to <8 x i32>
+; CHECK-NEXT:    [[TMP53:%.*]] = trunc <8 x i32> [[TMP52]] to <8 x i8>
+; CHECK-NEXT:    [[TMP54:%.*]] = mul <8 x i8> [[TMP53]],
+; CHECK-NEXT:    [[TMP55:%.*]] = zext <8 x i8> [[TMP54]] to <8 x i32>
+; CHECK-NEXT:    [[TMP56:%.*]] = trunc <8 x i32> [[TMP47]] to <8 x i8>
+; CHECK-NEXT:    [[TMP57:%.*]] = and <8 x i8> [[TMP56]], [[TMP37]]
+; CHECK-NEXT:    [[TMP58:%.*]] = zext <8 x i8> [[TMP57]] to <8 x i32>
+; CHECK-NEXT:    [[TMP59:%.*]] = trunc <8 x i32> [[TMP55]] to <8 x i8>
+; CHECK-NEXT:    [[TMP60:%.*]] = and <8 x i8> [[TMP59]],
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <8 x i8> [[TMP60]] to <8 x i32>
+; CHECK-NEXT:    [[TMP62:%.*]] = trunc <8 x i32> [[TMP61]] to <8 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = xor <8 x i8> [[TMP62]], [[TMP38]]
+; CHECK-NEXT:    [[TMP64:%.*]] = zext <8 x i8> [[TMP63]] to <8 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = trunc <8 x i32> [[TMP64]] to <8 x i8>
+; CHECK-NEXT:    [[TMP66:%.*]] = trunc <8 x i32> [[TMP58]] to <8 x i8>
+; CHECK-NEXT:    [[TMP67:%.*]] = mul <8 x i8> [[TMP65]], [[TMP66]]
+; CHECK-NEXT:    [[TMP68:%.*]] = zext <8 x i8> [[TMP67]] to <8 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = trunc <8 x i32> [[TMP68]] to <8 x i8>
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i8, ptr [[TMP70]], i32 0
+; CHECK-NEXT:    store <8 x i8> [[TMP69]], ptr [[TMP71]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX11]], 8
+; CHECK-NEXT:    [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP72]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -802,8 +778,8 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP85:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP85]] to i32
+; CHECK-NEXT:    [[TMP73:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP73]] to i32
 ; CHECK-NEXT:    [[ADD:%.*]] = shl i32 [[CONV]], 4
 ; CHECK-NEXT:    [[CONV2:%.*]] = add nsw i32 [[ADD]], 32
 ; CHECK-NEXT:    [[OR:%.*]] = and i32 [[CONV]], 204
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
--- a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
@@ -328,16 +328,14 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr <4 x i16> [[TMP6]],
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i16> [[TMP7]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i16> [[TMP9]] to <4 x i8>
-; CHECK-NEXT:    store <4 x i8> [[TMP10]], ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i16> [[TMP4]],
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i16> [[TMP7]] to <4 x i8>
+; CHECK-NEXT:    store <4 x i8> [[TMP8]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
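The zext/trunc chains that remain in the loop-vectorization-factors and trunc-shifts checks above are expected: the transform re-extends every shrunk result to its original width and truncates it again at the next narrow user, e.g. (an illustrative sketch, not a line from the tests):

    %op  = add <16 x i8> %x, %y
    %ext = zext <16 x i8> %op to <16 x i32>    ; re-extend inserted after the shrunk recipe
    %use = trunc <16 x i32> %ext to <16 x i8>  ; trunc inserted for the next shrunk user

As the new doc comment on truncateToMinimalBitwidths says, these pairs are deliberately left in as hints for InstCombine, which removes them later.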