Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14708,9 +14708,11 @@
 unsigned AArch64TargetLowering::getNumInterleavedAccesses(
     VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
   unsigned VecSize = 128;
+  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+  unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
   if (UseScalable)
     VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
-  return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
+  return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
 }

 MachineMemOperand::Flags
@@ -14724,29 +14726,32 @@
 bool AArch64TargetLowering::isLegalInterleavedAccessType(
     VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
-  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
-  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+  auto EC = VecTy->getElementCount();
+  unsigned MinElts = EC.getKnownMinValue();

   UseScalable = false;

-  // Ensure that the predicate for this number of elements is available.
-  if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(NumElements))
+  if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
     return false;

   // Ensure the number of vector elements is greater than 1.
-  if (NumElements < 2)
+  if (MinElts < 2)
     return false;

   // Ensure the element type is legal.
   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
     return false;

+  if (EC.isScalable())
+    return MinElts * ElSize == 128;
+
+  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   if (Subtarget->forceStreamingCompatibleSVE() ||
       (Subtarget->useSVEForFixedLengthVectors() &&
        (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
        (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
-        isPowerOf2_32(NumElements) && VecSize > 128)))) {
+        isPowerOf2_32(MinElts) && VecSize > 128)))) {
     UseScalable = true;
     return true;
   }

Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2782,19 +2782,23 @@
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
     bool UseMaskForCond, bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
-  auto *VecVTy = cast<FixedVectorType>(VecTy);
+  auto *VecVTy = cast<VectorType>(VecTy);
+
+  if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
+    return InstructionCost::getInvalid();

   if (!UseMaskForCond && !UseMaskForGaps &&
       Factor <= TLI->getMaxSupportedInterleaveFactor()) {
-    unsigned NumElts = VecVTy->getNumElements();
+    unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
     auto *SubVecTy =
-        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
+        VectorType::get(VecVTy->getElementType(),
+                        VecVTy->getElementCount().divideCoefficientBy(Factor));

     // ldN/stN only support legal vector types of size 64 or 128 in bits.
     // Accesses having vector types that are a multiple of 128 bits can be
     // matched to more than one ldN/stN instruction.
     bool UseScalable;
-    if (NumElts % Factor == 0 &&
+    if (MinElts % Factor == 0 &&
         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
   }
Index: llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -445,6 +445,8 @@
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
     bool UseMaskForCond, bool UseMaskForGaps) {
+  if (isa<ScalableVectorType>(VecTy))
+    return InstructionCost::getInvalid();
   auto *FVTy = cast<FixedVectorType>(VecTy);
   InstructionCost MemCost =
       getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -439,6 +439,38 @@
   return std::nullopt;
 }

+/// Return a vector containing interleaved elements from multiple
+/// smaller input vectors.
+static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
+                                const Twine &Name) {
+  unsigned Factor = Vals.size();
+  assert(Factor > 1 && "Tried to interleave invalid number of vectors");
+
+  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
+#ifndef NDEBUG
+  for (Value *Val : Vals)
+    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
+#endif
+
+  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
+  // must use intrinsics to interleave.
+  if (VecTy->isScalableTy()) {
+    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+    SmallVector<Value *> Ops(Vals.begin(), Vals.end());
+    return Builder.CreateIntrinsic(
+        WideVecTy, Intrinsic::experimental_vector_interleave2, Ops,
+        /*FMFSource=*/nullptr, Name);
+  }
+
+  // Fixed length. Start by concatenating all vectors into a wide vector.
+  Value *WideVec = concatenateVectors(Builder, Vals);
+
+  // Interleave the elements into the wide vector.
+  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
+  return Builder.CreateShuffleVector(
+      WideVec, createInterleaveMask(NumElts, Factor), Name);
+}
+
 namespace {
 // Forward declare GeneratedRTChecks.
 class GeneratedRTChecks;
@@ -2586,7 +2618,6 @@
   // Prepare for the vector type of the interleaved load/store.
   Type *ScalarTy = getLoadStoreType(Instr);
   unsigned InterleaveFactor = Group->getFactor();
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

   // Prepare for the new pointers.
@@ -2597,14 +2628,21 @@
   assert((!BlockInMask || !Group->isReverse()) &&
          "Reversed masked interleave-group not supported.");

+  Value *Idx;
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
   // rather than directly getting the pointer for lane VF - 1, because the
   // pointer operand of the interleaved access is supposed to be uniform. For
   // uniform instructions, we're only required to generate a value for the
   // first vector lane in each unroll iteration.
-  if (Group->isReverse())
-    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
+  if (Group->isReverse()) {
+    Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
+    Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+    Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
+    Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
+    Idx = Builder.CreateNeg(Idx);
+  } else
+    Idx = Builder.getInt32(-Index);

   for (unsigned Part = 0; Part < UF; Part++) {
     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
@@ -2625,8 +2663,7 @@
     bool InBounds = false;
     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
       InBounds = gep->isInBounds();
-    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index),
-                                 "", InBounds);
+    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);

     // Cast to the vector pointer type.
     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
@@ -2676,6 +2713,41 @@
       NewLoads.push_back(NewLoad);
     }

+    if (VecTy->isScalableTy()) {
+      assert(InterleaveFactor == 2 &&
+             "Unsupported deinterleave factor for scalable vectors");
+
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+        // so must use intrinsics to deinterleave.
+        Value *DI = Builder.CreateIntrinsic(
+            Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
+            /*FMFSource=*/nullptr, "strided.vec");
+        unsigned J = 0;
+        for (unsigned I = 0; I < InterleaveFactor; ++I) {
+          Instruction *Member = Group->getMember(I);
+
+          if (!Member)
+            continue;
+
+          Value *StridedVec = Builder.CreateExtractValue(DI, I);
+          // If this member has different type, cast the result type.
+          if (Member->getType() != ScalarTy) {
+            VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
+            StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
+          }
+
+          if (Group->isReverse())
+            StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
+
+          State.set(VPDefs[J], StridedVec, Part);
+          ++J;
+        }
+      }
+
+      return;
+    }
+
     // For each member in the group, shuffle out the appropriate data from the
     // wide loads.
     unsigned J = 0;
@@ -2749,14 +2821,8 @@
       StoredVecs.push_back(StoredVec);
     }

-    // Concatenate all vectors into a wide vector.
-    Value *WideVec = concatenateVectors(Builder, StoredVecs);
-
-    // Interleave the elements in the wide vector.
-    Value *IVec = Builder.CreateShuffleVector(
-        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
-        "interleaved.vec");
-
+    // Interleave all the smaller vectors into one wider vector.
+    Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
     Instruction *NewStoreInstr;
     if (BlockInMask || MaskForGaps) {
       Value *GroupMask = MaskForGaps;
@@ -6538,11 +6604,6 @@
 InstructionCost
 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                    ElementCount VF) {
-  // TODO: Once we have support for interleaving with scalable vectors
-  // we can calculate the cost properly here.
-  if (VF.isScalable())
-    return InstructionCost::getInvalid();
-
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
   unsigned AS = getLoadStoreAddressSpace(I);
@@ -8849,9 +8910,15 @@
   // single VPInterleaveRecipe.
   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
     auto applyIG = [IG, this](ElementCount VF) -> bool {
-      return (VF.isVector() && // Query is illegal for VF == 1
-              CM.getWideningDecision(IG->getInsertPos(), VF) ==
+      bool Result = (VF.isVector() && // Query is illegal for VF == 1
+                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
                   LoopVectorizationCostModel::CM_Interleave);
+      // For scalable vectors, the only interleave factor currently supported
+      // is 2 since we require the (de)interleave2 intrinsics instead of
+      // shufflevectors.
+      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+             "Unsupported interleave factor for scalable vectors");
+      return Result;
     };
     if (!getDecisionAndClampRange(applyIG, Range))
       continue;
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -606,61 +606,54 @@
 ; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A2]], i32 0
 ; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A1]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64()
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = add [[TMP9]], zeroinitializer
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul [[TMP10]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer)
-; CHECK-UNORDERED-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = mul i64 2, [[TMP13]]
-; CHECK-UNORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP14]], i64 0
-; CHECK-UNORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED: vector.body:
 ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[B]], [[VEC_IND]]
-; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP15]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison)
-; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd [[WIDE_MASKED_GATHER]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], [[TMP17]]
-; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP18]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison,
zeroinitializer), poison) -; CHECK-UNORDERED-NEXT: [[TMP19]] = fadd [[WIDE_MASKED_GATHER2]], [[VEC_PHI]] -; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]] -; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-UNORDERED-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load , * [[TMP12]], align 4 +; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( [[WIDE_VEC]]) +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd [[TMP13]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd [[TMP14]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP16]]) -; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP19]]) +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP15]]) +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP16]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: ; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] -; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-UNORDERED: for.body: ; CHECK-UNORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] -; 
CHECK-UNORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] ; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-UNORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] -; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDXB1]], align 4 -; CHECK-UNORDERED-NEXT: [[ADD1]] = fadd float [[TMP25]], [[ADD_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDXB1]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD1]] = fadd float [[TMP22]], [[ADD_PHI2]] ; CHECK-UNORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 ; CHECK-UNORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]] -; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDXB2]], align 4 -; CHECK-UNORDERED-NEXT: [[ADD2]] = fadd float [[TMP26]], [[ADD_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = load float, float* [[ARRAYIDXB2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD2]] = fadd float [[TMP23]], [[ADD_PHI1]] ; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 ; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] -; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-UNORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4 ; CHECK-UNORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4 ; CHECK-UNORDERED-NEXT: ret void @@ -684,59 +677,52 @@ ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP11]] -; CHECK-ORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 -; CHECK-ORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B]], [[VEC_IND]] -; CHECK-ORDERED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[B]], [[TMP14]] -; CHECK-ORDERED-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP15]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-ORDERED-NEXT: [[TMP16]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[WIDE_MASKED_GATHER2]]) -; CHECK-ORDERED-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[WIDE_MASKED_GATHER]]) -; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] -; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-ORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]] +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to * +; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load , * [[TMP10]], align 4 +; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( [[WIDE_VEC]]) +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP12]]) +; CHECK-ORDERED-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP11]]) +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-ORDERED: middle.block: ; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP16]], 
[[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] -; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDXB1]], align 4 -; CHECK-ORDERED-NEXT: [[ADD1]] = fadd float [[TMP21]], [[ADD_PHI2]] +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDXB1]], align 4 +; CHECK-ORDERED-NEXT: [[ADD1]] = fadd float [[TMP18]], [[ADD_PHI2]] ; CHECK-ORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 ; CHECK-ORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]] -; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDXB2]], align 4 -; CHECK-ORDERED-NEXT: [[ADD2]] = fadd float [[TMP22]], [[ADD_PHI1]] +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDXB2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD2]] = fadd float [[TMP19]], [[ADD_PHI1]] ; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4 ; CHECK-ORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4 ; CHECK-ORDERED-NEXT: ret void Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -31,37 +31,31 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 
[[TMP5]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = mul nsw [[WIDE_MASKED_GATHER1]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP9]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP10]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[TMP3]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw [[TMP4]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 
[[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 -1 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP6]], [[TMP7]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -159,18 +153,19 @@ ; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( [[TMP8]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_MASKED_GATHER]] to ; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP10]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = sext [[WIDE_MASKED_GATHER1]] to -; CHECK-NEXT: [[TMP13:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_MASKED_GATHER1]] to +; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 -1 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP10]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -180,17 +175,17 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 
0, i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, i64 [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 ; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV]], [[C]] ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: store i32 [[ADD3]], ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP21]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV6]], [[D]] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP19]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP20]] ; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 @@ -259,31 +254,33 @@ ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = trunc [[TMP9]] to -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP10]], [[TMP11]], i32 2, shufflevector ( insertelement ( 
poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[WIDE_MASKED_GATHER1]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP13:%.*]] = trunc [[TMP12]] to -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP13]], [[TMP14]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[TMP7]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc [[TMP10]] to +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP11]], [[TMP12]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw [[TMP8]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc [[TMP13]] to +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[TMP9]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP14]], [[TMP15]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -293,17 +290,17 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP18]], [[C]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[C]] ; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[ADD3]] to i16 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: store i16 [[CONV]], ptr [[ARRAYIDX5]], align 2 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP20]], [[D]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP21]], [[D]] ; CHECK-NEXT: [[CONV6:%.*]] = trunc i32 [[MUL]] to i16 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, i64 [[TMP19]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, i64 [[TMP20]] ; CHECK-NEXT: store i16 [[CONV6]], ptr [[ARRAYIDX9]], align 2 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 @@ -485,40 +482,47 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = add nsw i64 [[N_MOD_VF]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer), [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP3]], -4 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[DOTNEG]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i32() -; CHECK-NEXT: [[INDUCTION1:%.*]] = sub shufflevector ( insertelement ( poison, i32 1023, i64 0), poison, zeroinitializer), [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[DOTNEG7:%.*]] = mul nsw i32 [[TMP5]], -4 -; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement poison, i32 [[DOTNEG7]], i64 0 -; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector [[DOTSPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i32 1023, i64 0), poison, zeroinitializer), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i32 [[TMP3]], -4 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[DOTNEG]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND4:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[VEC_IND4]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds 
[[STRUCT_ST2]], ptr [[A]], [[VEC_IND]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = sub nsw [[WIDE_MASKED_GATHER6]], [[VEC_IND4]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP7]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B]], [[VEC_IND]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP9]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[VEC_IND_NEXT5]] = add [[VEC_IND4]], [[DOTSPLAT3]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP10]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]] +; CHECK-NEXT: [[REVERSE2:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP12]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[REVERSE2]], [[REVERSE3]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP19]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = shl nuw nsw i64 [[TMP20]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], [[TMP21]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -591,28 +595,23 @@ ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP3]] ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = and i64 [[INDEX]], 9223372036854775804 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = shl nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: store [[TMP9]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP4]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = shl nsw [[TMP5]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP7]] +; CHECK-NEXT: store [[TMP6]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -681,28 +680,23 @@ ; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP8]] ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP10:%.*]] = shl [[TMP9]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP10]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = and i64 [[INDEX]], 9223372036854775804 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP14:%.*]] = shl nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: store [[TMP14]], ptr [[TMP15]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = shl nsw [[TMP10]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP12]] +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -849,35 +843,30 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; 
CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP9]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER2]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP11]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { 
, } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 -1 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP6]], [[TMP9]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -968,49 +957,44 @@ ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ insertelement ( zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP5]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP6]] = add [[WIDE_MASKED_GATHER]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT]], ptr [[P]], [[VEC_IND]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP7]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP8]] = fadd fast [[VEC_PHI]], [[WIDE_MASKED_GATHER2]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +; CHECK-NEXT: [[TMP6]] = add [[TMP3]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP7]] = fadd fast [[VEC_PHI]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP6]]) -; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP6]]) +; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX3:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr @SA, align 4 ; CHECK-NEXT: store float [[ADD3_LCSSA]], ptr @SB, align 4 ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUMB_014:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUMA_013:%.*]] = phi i32 [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUMA_013:%.*]] = phi i32 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT]], ptr [[P]], i64 [[INDVARS_IV]], i32 0 ; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[LOAD1]], [[SUMA_013]] @@ -1091,14 +1075,17 @@ ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr 
inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[TMP9]], i64 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP12]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1169,39 +1156,44 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP11]] = add [[WIDE_MASKED_GATHER1]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], [[TMP13]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP11]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP10]], i64 0 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP11]]) +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP17:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[P_I_X]], align 4 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[P_I_Y]], align 4 -; CHECK-NEXT: [[TMP17]] = add nsw i32 [[TMP16]], [[S]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[P_I_X]], align 4 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[P_I_Y]], align 4 +; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP19]], [[S]] ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 [[TMP17]] +; CHECK-NEXT: ret i32 [[TMP20]] ; entry: br label %for.body @@ -1261,17 +1253,19 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: 
[[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], shufflevector ( insertelement ( poison, i64 -1, i64 0), poison, zeroinitializer), i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1283,8 +1277,8 @@ ; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 ; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_X]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[P_I_Y]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4 +; CHECK-NEXT: store i32 [[TMP16]], ptr [[P_I_Y]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP27:![0-9]+]] @@ -1345,44 +1339,48 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; 
CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[TMP9]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP13]] = add [[WIDE_MASKED_GATHER1]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15]] = add [[TMP14]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP15]]) ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP22:%.*]], [[FOR_BODY]] ], [ 
[[BC_MERGE_RDX]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
 ; CHECK-NEXT: [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I_PLUS_1]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[P_I_X]], align 4
-; CHECK-NEXT: store i32 [[TMP18]], ptr [[P_I_PLUS_1_Y]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[P_I_Y]], align 4
-; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP19]], [[S]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_I_X]], align 4
+; CHECK-NEXT: store i32 [[TMP20]], ptr [[P_I_PLUS_1_Y]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_I_Y]], align 4
+; CHECK-NEXT: [[TMP22]] = add nsw i32 [[TMP21]], [[S]]
 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK: for.end:
-; CHECK-NEXT: ret i32 [[TMP20]]
+; CHECK-NEXT: ret i32 [[TMP22]]
 ;
 entry:
   br label %for.body
@@ -1452,18 +1450,20 @@
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = add [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT2]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT4]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 -1
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]])
+; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label
[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -30,48 +30,43 @@ ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP4]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[C]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP5]], 6 -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = shl [[TMP8]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 -; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector [[DOTSPLATINSERT2]], poison, zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP11:%.*]] = add [[DOTSPLAT3]], [[TMP10]] -; CHECK-NEXT: [[VECTOR_GEP4:%.*]] = shl [[TMP11]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP4]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, [[TMP9]], i64 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, [[TMP12]], i64 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP15:%.*]] = add nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP16:%.*]] = add nsw [[WIDE_MASKED_GATHER5]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP19]] -; CHECK-NEXT: store [[TMP16]], ptr [[TMP20]], align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add nsw [[WIDE_MASKED_GATHER6]], shufflevector ( insertelement ( poison, 
i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP22:%.*]] = add nsw [[WIDE_MASKED_GATHER7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP21]], ptr [[TMP23]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP25:%.*]] = shl nuw nsw i64 [[TMP24]], 2 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP25]] -; CHECK-NEXT: store [[TMP22]], ptr [[TMP26]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP27]], 3 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP28]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 5 +; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load , ptr [[NEXT_GEP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC3]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = add nsw [[TMP10]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = add nsw [[TMP12]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: store [[TMP14]], ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP18]] +; CHECK-NEXT: store [[TMP15]], ptr [[TMP19]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP11]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = add nsw [[TMP13]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: store [[TMP20]], ptr [[TMP22]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[TMP23]], 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP24]] +; CHECK-NEXT: store [[TMP21]], ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = shl nuw nsw i64 [[TMP26]], 3 +; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -83,13 +78,13 @@
 ; CHECK-NEXT: [[PTR_014:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[PTR_014]], i64 1
-; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[PTR_014]], align 4
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[PTR_014]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, ptr [[PTR_014]], i64 2
-; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP30]], 1
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP29]], 1
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_013]]
 ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP31]], 1
+; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP30]], 1
 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_013]]
 ; CHECK-NEXT: store i32 [[ADD2]], ptr [[ARRAYIDX3]], align 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1