Index: llvm/include/llvm/Analysis/VectorUtils.h
===================================================================
--- llvm/include/llvm/Analysis/VectorUtils.h
+++ llvm/include/llvm/Analysis/VectorUtils.h
@@ -593,6 +593,16 @@
 /// elements, it will be padded with undefs.
 Value *concatenateVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vecs);
 
+/// Return an anonymous struct value containing multiple smaller vectors
+/// deinterleaved from the larger input vector.
+Value *deinterleaveVector(IRBuilderBase &Builder, Value *Val, uint32_t Factor,
+                          const Twine &Name = "");
+
+/// Return a vector containing interleaved elements from multiple
+/// smaller input vectors.
+Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
+                         const Twine &Name = "");
+
 /// Given a mask vector of i1, Return true if all of the elements of this
 /// predicate mask are known to be false or undef.  That is, return true if all
 /// lanes can be assumed inactive.
Index: llvm/lib/Analysis/VectorUtils.cpp
===================================================================
--- llvm/lib/Analysis/VectorUtils.cpp
+++ llvm/lib/Analysis/VectorUtils.cpp
@@ -929,6 +929,72 @@
   return ResList[0];
 }
 
+Value *llvm::deinterleaveVector(IRBuilderBase &Builder, Value *Val,
+                                uint32_t Factor, const Twine &Name) {
+  assert(Factor > 1 && "Tried to deinterleave to invalid number of vectors");
+  VectorType *VecTy = cast<VectorType>(Val->getType());
+
+  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
+  // must use intrinsics to deinterleave.
+  if (VecTy->isScalableTy()) {
+    assert(Factor == 2 &&
+           "Unsupported deinterleave factor for scalable vectors");
+    return Builder.CreateIntrinsic(
+        Intrinsic::experimental_vector_deinterleave2, VecTy, Val,
+        /*FMFSource=*/nullptr, Name);
+  }
+
+  // Fixed length. Use multiple shufflevectors and insert into an anonymous
+  // struct. InstCombine can fold away any insert/extract pairs.
+  const unsigned NumElts = VecTy->getElementCount().getFixedValue() / Factor;
+  FixedVectorType *NewVecTy =
+      FixedVectorType::get(VecTy->getElementType(), NumElts);
+  SmallVector<Type *> SubVecTys;
+  for (unsigned I = 0; I < Factor; ++I)
+    SubVecTys.push_back(NewVecTy);
+
+  StructType *RetTy = StructType::create(SubVecTys);
+  Value *RetVals = PoisonValue::get(RetTy);
+  for (unsigned I = 0; I < Factor; ++I) {
+    auto StrideMask = createStrideMask(I, Factor, NumElts);
+    RetVals = Builder.CreateInsertValue(
+        RetVals, Builder.CreateShuffleVector(Val, StrideMask, Name), I);
+  }
+
+  return RetVals;
+}
+
+Value *llvm::interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
+                               const Twine &Name) {
+  unsigned Factor = Vals.size();
+  assert(Factor > 1 && "Tried to interleave invalid number of vectors");
+
+  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
+#ifndef NDEBUG
+  for (Value *Val : Vals)
+    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
+#endif
+
+  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
+  // must use intrinsics to interleave.
+  if (VecTy->isScalableTy()) {
+    assert(Factor == 2 && "Unsupported interleave factor for scalable vectors");
+    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+    SmallVector<Value *> Ops(Vals.begin(), Vals.end());
+    return Builder.CreateIntrinsic(
+        WideVecTy, Intrinsic::experimental_vector_interleave2, Ops,
+        /*FMFSource=*/nullptr, Name);
+  }
+
+  // Fixed length. Start by concatenating all vectors into a wide vector.
+  Value *WideVec = concatenateVectors(Builder, Vals);
+
+  // Interleave the elements into the wide vector.
+  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
+  return Builder.CreateShuffleVector(
+      WideVec, createInterleaveMask(NumElts, Factor), Name);
+}
+
 bool llvm::maskIsAllZeroOrUndef(Value *Mask) {
   assert(isa<VectorType>(Mask->getType()) &&
          isa<IntegerType>(Mask->getType()->getScalarType()) &&
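For a factor-2 group, the two helpers above are expected to produce the IR shapes sketched below. This is a hand-written, illustrative sketch (function names are made up; nothing here is taken from the patch's tests): the fixed-length path emits strided shufflevectors inserted into a struct, while the scalable path calls the llvm.experimental.vector.deinterleave2 intrinsic. Note that deinterleaveVector creates an identified anonymous struct type via StructType::create, which is why the autogenerated fixed-length tests later in the patch refer to that type through a FileCheck variable such as [[TMP0]].

  ; Fixed length, factor 2: one strided shuffle per member, wrapped in a struct.
  define { <4 x i32>, <4 x i32> } @deinterleave_fixed_sketch(<8 x i32> %wide.vec) {
    %even = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
    %odd  = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
    %r0 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %even, 0
    %r1 = insertvalue { <4 x i32>, <4 x i32> } %r0, <4 x i32> %odd, 1
    ret { <4 x i32>, <4 x i32> } %r1
  }

  ; Scalable, factor 2: arbitrary shuffles are unavailable, so the dedicated
  ; deinterleave intrinsic is used instead.
  define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_scalable_sketch(<vscale x 8 x i32> %wide.vec) {
    %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
  }
  declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)

interleaveVectors is the mirror image: a concatenation plus one interleaving shufflevector for fixed-length inputs, or a call to llvm.experimental.vector.interleave2 for scalable ones.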
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14325,9 +14325,11 @@
 unsigned AArch64TargetLowering::getNumInterleavedAccesses(
     VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
   unsigned VecSize = 128;
+  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+  unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
   if (UseScalable)
     VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
-  return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
+  return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
 }
 
 MachineMemOperand::Flags
@@ -14341,29 +14343,36 @@
 bool AArch64TargetLowering::isLegalInterleavedAccessType(
     VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
-  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
-  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+  auto EC = VecTy->getElementCount();
+  unsigned MinElts = EC.getKnownMinValue();
 
   UseScalable = false;
 
-  // Ensure that the predicate for this number of elements is available.
-  if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(NumElements))
+  if (Subtarget->hasSVE() &&
+      !getSVEPredPatternFromNumElements(EC.getKnownMinValue()))
     return false;
 
   // Ensure the number of vector elements is greater than 1.
-  if (NumElements < 2)
+  if (MinElts < 2)
     return false;
 
   // Ensure the element type is legal.
   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
     return false;
 
+  if (EC.isScalable()) {
+    if (MinElts * ElSize == 128)
+      return true;
+    return false;
+  }
+
+  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   if (Subtarget->forceStreamingCompatibleSVE() ||
       (Subtarget->useSVEForFixedLengthVectors() &&
        (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
        (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
-        isPowerOf2_32(NumElements) && VecSize > 128)))) {
+        isPowerOf2_32(MinElts) && VecSize > 128)))) {
     UseScalable = true;
     return true;
   }
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2663,19 +2663,23 @@
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
     bool UseMaskForCond, bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
-  auto *VecVTy = cast<FixedVectorType>(VecTy);
+  auto *VecVTy = cast<VectorType>(VecTy);
+
+  if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
+    return InstructionCost::getInvalid();
 
   if (!UseMaskForCond && !UseMaskForGaps &&
       Factor <= TLI->getMaxSupportedInterleaveFactor()) {
-    unsigned NumElts = VecVTy->getNumElements();
+    unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
     auto *SubVecTy =
-        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
+        VectorType::get(VecVTy->getElementType(),
+                        VecVTy->getElementCount().divideCoefficientBy(Factor));
 
     // ldN/stN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
     // matched to more than one ldN/stN instruction.
     bool UseScalable;
-    if (NumElts % Factor == 0 &&
+    if (MinElts % Factor == 0 &&
         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
   }
Index: llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -439,6 +439,8 @@
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
     bool UseMaskForCond, bool UseMaskForGaps) {
+  if (isa<ScalableVectorType>(VecTy))
+    return InstructionCost::getInvalid();
   auto *FVTy = cast<FixedVectorType>(VecTy);
   InstructionCost MemCost =
       getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
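A worked example of the cost-model arithmetic above, together with the vectorizer change in LoopVectorize.cpp below, assuming SVE and i32 elements at VF = vscale x 4 (the names and values here are illustrative, not taken from the patch's tests): the factor-2 wide vector is <vscale x 8 x i32> and each member sub-vector is <vscale x 4 x i32>, so isLegalInterleavedAccessType accepts it because MinElts * ElSize = 4 * 32 = 128; getNumInterleavedAccesses then returns (4 * 32 + 127) / 128 = 1, and the group is costed as Factor * 1 = 2, corresponding to a single ld2. The loop body the vectorizer is now expected to emit for such a load group looks roughly like:

  define <vscale x 4 x i32> @load_group_sketch(ptr %p) {
    ; One wide load covering both interleaved members ...
    %wide.vec = load <vscale x 8 x i32>, ptr %p, align 4
    ; ... deinterleaved via the intrinsic and consumed per member.
    %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
    %member0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %strided.vec, 0
    %member1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %strided.vec, 1
    %sum = add <vscale x 4 x i32> %member0, %member1
    ret <vscale x 4 x i32> %sum
  }
  declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)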
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2648,7 +2648,6 @@
   // Prepare for the vector type of the interleaved load/store.
   Type *ScalarTy = getLoadStoreType(Instr);
   unsigned InterleaveFactor = Group->getFactor();
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
 
   // Prepare for the new pointers.
@@ -2659,14 +2658,21 @@
   assert((!BlockInMask || !Group->isReverse()) &&
          "Reversed masked interleave-group not supported.");
 
+  Value *Idx;
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
   // rather than directly getting the pointer for lane VF - 1, because the
   // pointer operand of the interleaved access is supposed to be uniform. For
   // uniform instructions, we're only required to generate a value for the
   // first vector lane in each unroll iteration.
-  if (Group->isReverse())
-    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
+  if (Group->isReverse()) {
+    Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
+    Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+    Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
+    Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
+    Idx = Builder.CreateNeg(Idx);
+  } else
+    Idx = Builder.getInt32(-Index);
 
   for (unsigned Part = 0; Part < UF; Part++) {
     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
@@ -2687,8 +2693,7 @@
     bool InBounds = false;
     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
       InBounds = gep->isInBounds();
-    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index),
-                                 "", InBounds);
+    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
 
     // Cast to the vector pointer type.
     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
@@ -2734,8 +2739,10 @@
       else
         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                             Group->getAlign(), "wide.vec");
+
       Group->addMetadata(NewLoad);
-      NewLoads.push_back(NewLoad);
+      NewLoads.push_back(deinterleaveVector(Builder, NewLoad, InterleaveFactor,
+                                            "strided.vec"));
     }
 
     // For each member in the group, shuffle out the appropriate data from the
@@ -2751,12 +2758,10 @@
     auto StrideMask =
         createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
     for (unsigned Part = 0; Part < UF; Part++) {
-      Value *StridedVec = Builder.CreateShuffleVector(
-          NewLoads[Part], StrideMask, "strided.vec");
+      Value *StridedVec = Builder.CreateExtractValue(NewLoads[Part], I);
 
       // If this member has different type, cast the result type.
       if (Member->getType() != ScalarTy) {
-        assert(!VF.isScalable() && "VF is assumed to be non scalable.");
         VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
         StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
       }
@@ -2811,14 +2816,8 @@
       StoredVecs.push_back(StoredVec);
     }
 
-    // Concatenate all vectors into a wide vector.
-    Value *WideVec = concatenateVectors(Builder, StoredVecs);
-
-    // Interleave the elements in the wide vector.
-    Value *IVec = Builder.CreateShuffleVector(
-        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
-        "interleaved.vec");
-
+    // Interleave all the smaller vectors into one wider vector.
+    Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
     Instruction *NewStoreInstr;
     if (BlockInMask || MaskForGaps) {
       Value *GroupMask = MaskForGaps;
@@ -6610,11 +6609,6 @@
 InstructionCost
 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                    ElementCount VF) {
-  // TODO: Once we have support for interleaving with scalable vectors
-  // we can calculate the cost properly here.
- if (VF.isScalable()) - return InstructionCost::getInvalid(); - Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); Index: llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -103,10 +103,10 @@ ; CHECK-LABEL: @ptr_ind_plus2( ; CHECK: %[[V0:.*]] = load <8 x i32> -; CHECK: %[[V1:.*]] = load <8 x i32> ; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> poison, <4 x i32> -; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> ; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> poison, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> ; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> ; CHECK: mul nsw <4 x i32> ; CHECK: mul nsw <4 x i32> Index: llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll @@ -11,10 +11,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 40004 -; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 80007 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[UGLYGEP1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[UGLYGEP]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 40004 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 80007 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -32,26 +32,30 @@ ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 1 -; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope !0 -; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope !0 -; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP11]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue [[TMP0]] poison, <4 x i32> [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertvalue [[TMP0]] [[TMP9]], <4 x i32> [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue [[TMP0]] [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 
1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 1 ; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope !0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i24> poison, i24 [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i24> [[TMP17]], i24 [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i24> [[TMP18]], i24 [[TMP15]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i24> [[TMP19]], i24 [[TMP16]], i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i24> [[TMP20]] to <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 -; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP24]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP17:%.*]] = load i24, ptr [[TMP13]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP18:%.*]] = load i24, ptr [[TMP14]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP19:%.*]] = load i24, ptr [[TMP15]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i24> poison, i24 [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i24> [[TMP20]], i24 [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i24> [[TMP21]], i24 [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i24> [[TMP22]], i24 [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = zext <4 x i24> [[TMP23]] to <4 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = add <4 x i32> [[TMP11]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP27]], align 4, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -prefer-predicate-over-epilogue=scalar-epilogue \ ; RUN: -force-ordered-reductions=false -hints-allow-reordering=false -S | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -prefer-predicate-over-epilogue=scalar-epilogue \ @@ -12,43 +13,178 @@ ; RUN: -hints-allow-reordering=false -S | FileCheck %s --check-prefix=CHECK-ORDERED-TF define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) #0 { 
-; CHECK-ORDERED-LABEL: @fadd_strict -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD:.*]] = load , * -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI]], %[[LOAD]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: ret float %[[PHI]] - -; CHECK-ORDERED-TF-LABEL: @fadd_strict -; CHECK-ORDERED-TF: vector.body: -; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK:.*]] = phi -; CHECK-ORDERED-TF: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED-TF: %[[LOAD:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* -; CHECK-ORDERED-TF: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI]], %[[SEL]]) -; CHECK-ORDERED-TF: for.end -; CHECK-ORDERED-TF: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED-TF: ret float %[[PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_strict -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load , * -; CHECK-UNORDERED: %[[FADD_VEC]] = fadd %[[LOAD_VEC]], %[[VEC_PHI]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, %[[FADD_VEC]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float 
@fadd_strict +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8 +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP8]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP13]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], 
[[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP8]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8 +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP12]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-TF-LABEL: define float 
@fadd_strict +; CHECK-ORDERED-TF-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-ORDERED-TF-NEXT: entry: +; CHECK-ORDERED-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED-TF: vector.ph: +; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] +; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED-TF: vector.body: +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP14]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-ORDERED-TF: middle.block: +; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF: scalar.ph: +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, 
[[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED-TF: for.body: +; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP20]], [[SUM_07]] +; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-ORDERED-TF: for.end: +; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] +; + + + entry: br label %for.body @@ -68,75 +204,340 @@ } define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) #0 { -; CHECK-ORDERED-LABEL: @fadd_strict_unroll -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ] -; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load , * -; CHECK-ORDERED: %[[LOAD2:.*]] = load , * -; CHECK-ORDERED: %[[LOAD3:.*]] = load , * -; CHECK-ORDERED: %[[LOAD4:.*]] = load , * -; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI1]], %[[LOAD1]]) -; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX1]], %[[LOAD2]]) -; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX2]], %[[LOAD3]]) -; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX3]], %[[LOAD4]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ] -; CHECK-ORDERED: ret float %[[PHI]] - -; CHECK-ORDERED-TF-LABEL: @fadd_strict_unroll -; CHECK-ORDERED-TF: vector.body: -; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi -; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi -; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK3:.*]] = phi -; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK4:.*]] = phi -; CHECK-ORDERED-TF: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ] -; CHECK-ORDERED-TF-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ] -; CHECK-ORDERED-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* -; CHECK-ORDERED-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* -; CHECK-ORDERED-TF: %[[LOAD3:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* -; CHECK-ORDERED-TF: %[[LOAD4:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* -; CHECK-ORDERED-TF: %[[SEL1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[LOAD1]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI1]], %[[SEL1]]) -; CHECK-ORDERED-TF: %[[SEL2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[LOAD2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: 
%[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX1]], %[[SEL2]]) -; CHECK-ORDERED-TF: %[[SEL3:.*]] = select %[[ACTIVE_LANE_MASK3]], %[[LOAD3]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX2]], %[[SEL3]]) -; CHECK-ORDERED-TF: %[[SEL4:.*]] = select %[[ACTIVE_LANE_MASK4]], %[[LOAD4]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[RDX4]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX3]], %[[SEL4]]) -; CHECK-ORDERED-TF: for.end -; CHECK-ORDERED-TF: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ] -; CHECK-ORDERED-TF: ret float %[[PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_strict_unroll -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load , * -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load , * -; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load , * -; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load , * -; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd %[[VEC_LOAD1]], %[[VEC_PHI1]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd %[[VEC_LOAD2]], %[[VEC_PHI2]] -; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd %[[VEC_LOAD3]], %[[VEC_PHI3]] -; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd %[[VEC_LOAD4]], %[[VEC_PHI4]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd %[[VEC_FADD2]], %[[VEC_FADD1]] -; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd %[[VEC_FADD3]], %[[BIN_RDX1]] -; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd %[[VEC_FADD4]], %[[BIN_RDX2]] -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, %[[BIN_RDX3]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict_unroll +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; 
CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_strict_unroll +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32 +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 
[[INDEX]], [[TMP18]] +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]] +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]] +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP25]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 8 +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP27]] +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , * [[TMP29]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP31]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , * [[TMP33]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 24 +; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP35]] +; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , * [[TMP37]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP38]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP39]] = fadd [[WIDE_LOAD4]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP40]] = fadd [[WIDE_LOAD5]], [[VEC_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP41]] = fadd [[WIDE_LOAD6]], [[VEC_PHI3]] +; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 32 +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP43]] +; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP39]], [[TMP38]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd [[TMP40]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd [[TMP41]], [[BIN_RDX7]] +; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX8]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], 
[ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP46]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict_unroll +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32 +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP25]], align 4 +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = mul i64 
[[TMP26]], 8 +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP27]] +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , * [[TMP29]], align 4 +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP31]] +; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , * [[TMP33]], align 4 +; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 24 +; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP35]] +; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , * [[TMP37]], align 4 +; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP38]], [[WIDE_LOAD1]]) +; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP39]], [[WIDE_LOAD2]]) +; CHECK-ORDERED-NEXT: [[TMP41]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP40]], [[WIDE_LOAD3]]) +; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 32 +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP43]] +; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP45]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-TF-LABEL: define float @fadd_strict_unroll +; CHECK-ORDERED-TF-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-TF-NEXT: entry: +; CHECK-ORDERED-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; CHECK-ORDERED-TF: vector.ph: +; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] +; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]] +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED-TF: vector.body: +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP72:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP31]] +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP36]] +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP41]] +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[TMP47]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = bitcast float* [[TMP51]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP52]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[TMP47]], i64 [[TMP54]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = bitcast float* [[TMP55]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP56]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP47]], i64 [[TMP58]] +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = bitcast float* [[TMP59]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP60]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP47]], i64 [[TMP62]] +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = 
bitcast float* [[TMP63]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP64]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP65]]) +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = select [[ACTIVE_LANE_MASK6]], [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], [[TMP67]]) +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = select [[ACTIVE_LANE_MASK7]], [[WIDE_MASKED_LOAD10]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP68]], [[TMP69]]) +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP72]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP70]], [[TMP71]]) +; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = mul i64 [[TMP73]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = add i64 [[INDEX]], [[TMP74]] +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = mul i64 [[TMP76]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = add i64 [[INDEX]], [[TMP77]] +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = add i64 [[INDEX]], [[TMP80]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP75]], i64 [[TMP20]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP78]], i64 [[TMP25]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP81]], i64 [[TMP30]]) +; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = mul i64 [[TMP82]], 32 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP83]] +; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP86:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = extractelement [[TMP84]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP88]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; 
CHECK-ORDERED-TF: middle.block: +; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF: scalar.ph: +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP72]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED-TF: for.body: +; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP89]], [[SUM_07]] +; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-ORDERED-TF: for.end: +; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP72]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] +; + + + entry: br label %for.body @@ -156,93 +557,266 @@ } define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { -; CHECK-ORDERED-LABEL: @fadd_strict_interleave -; CHECK-ORDERED: entry -; CHECK-ORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1 -; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float* %a -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]] -; CHECK-ORDERED: vector.ph -; CHECK-ORDERED: %[[STEPVEC1:.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-ORDERED: %[[STEPVEC_ADD1:.*]] = add %[[STEPVEC1]], zeroinitializer -; CHECK-ORDERED: %[[STEPVEC_MUL:.*]] = mul %[[STEPVEC_ADD1]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) -; CHECK-ORDERED: %[[INDUCTION:.*]] = add zeroinitializer, %[[STEPVEC_MUL]] -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ] -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ] -; CHECK-ORDERED: %[[VEC_IND:.*]] = phi [ %[[INDUCTION]], %vector.ph ], [ {{.*}}, %vector.body ] -; CHECK-ORDERED: %[[GEP1:.*]] = getelementptr inbounds float, float* %b, %[[VEC_IND]] -; CHECK-ORDERED: %[[MGATHER1:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP1]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-ORDERED: %[[OR:.*]] = or %[[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-ORDERED: %[[GEP2:.*]] = getelementptr inbounds float, float* %b, %[[OR]] -; CHECK-ORDERED: %[[MGATHER2:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP2]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI2]], %[[MGATHER2]]) -; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], %[[MGATHER1]]) - -; CHECK-ORDERED-TF-LABEL: 
@fadd_strict_interleave -; CHECK-ORDERED-TF: entry -; CHECK-ORDERED-TF: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1 -; CHECK-ORDERED-TF: %[[LOAD1:.*]] = load float, float* %a -; CHECK-ORDERED-TF: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]] -; CHECK-ORDERED-TF: vector.ph -; CHECK-ORDERED-TF: %[[STEPVEC1:.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-ORDERED-TF: %[[STEPVEC_ADD1:.*]] = add %[[STEPVEC1]], zeroinitializer -; CHECK-ORDERED-TF: %[[STEPVEC_MUL:.*]] = mul %[[STEPVEC_ADD1]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[INDUCTION:.*]] = add zeroinitializer, %[[STEPVEC_MUL]] -; CHECK-ORDERED-TF: vector.body -; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK:.*]] = phi -; CHECK-ORDERED-TF: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ] -; CHECK-ORDERED-TF: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ] -; CHECK-ORDERED-TF: %[[VEC_IND:.*]] = phi [ %[[INDUCTION]], %vector.ph ], [ {{.*}}, %vector.body ] -; CHECK-ORDERED-TF: %[[GEP1:.*]] = getelementptr inbounds float, float* %b, %[[VEC_IND]] -; CHECK-ORDERED-TF: %[[MGATHER1:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP1]], i32 4, %[[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF: %[[OR:.*]] = or %[[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[GEP2:.*]] = getelementptr inbounds float, float* %b, %[[OR]] -; CHECK-ORDERED-TF: %[[MGATHER2:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP2]], i32 4, %[[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF: %[[SEL2:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MGATHER2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[RDX2]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI2]], %[[SEL2]]) -; CHECK-ORDERED-TF: %[[SEL1:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MGATHER1]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[RDX1]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], %[[SEL1]]) - -; CHECK-UNORDERED-LABEL: @fadd_strict_interleave -; CHECK-UNORDERED: entry -; CHECK-UNORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1 -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* %a -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]] -; CHECK-UNORDERED: vector.ph -; CHECK-UNORDERED: %[[INS_ELT2:.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float %[[LOAD2]], i32 0 -; CHECK-UNORDERED: %[[INS_ELT1:.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float %[[LOAD1]], i32 0 -; CHECK-UNORDERED: %[[STEPVEC1:.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-UNORDERED: %[[STEPVEC_ADD1:.*]] = add %[[STEPVEC1]], zeroinitializer -; CHECK-UNORDERED: %[[STEPVEC_MUL:.*]] = mul %[[STEPVEC_ADD1]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) -; CHECK-UNORDERED: %[[INDUCTION:.*]] = add zeroinitializer, %[[STEPVEC_MUL]] -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi [ %[[INS_ELT2]], %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi [ %[[INS_ELT1]], %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: 
%[[GEP1:.*]] = getelementptr inbounds float, float* %b, -; CHECK-UNORDERED: %[[MGATHER1:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP1]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd %[[MGATHER1]], %[[VEC_PHI1]] -; CHECK-UNORDERED: %[[OR:.*]] = or {{.*}}, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-UNORDERED: %[[GEP2:.*]] = getelementptr inbounds float, float* %b, %[[OR]] -; CHECK-UNORDERED: %[[MGATHER2:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP2]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd %[[MGATHER2]], %[[VEC_PHI2]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[VEC_RDX1:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, %[[VEC_FADD1]]) -; CHECK-UNORDERED: %[[VEC_RDX2:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD3]], {{.*}} -; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float %[[LOAD4]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RDX1:.*]] = phi float [ %[[FADD1]], %for.body ], [ %[[VEC_RDX1]], %middle.block ] -; CHECK-UNORDERED: %[[RDX2:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[VEC_RDX2]], %middle.block ] -; CHECK-UNORDERED: store float %[[RDX1]], float* %a -; CHECK-UNORDERED: store float %[[RDX2]], float* {{.*}} -; CHECK-UNORDERED: ret void - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define void @fadd_strict_interleave +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-NOT-VECTORIZED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[A2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDXB1]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD1]] = fadd float [[TMP0]], [[ADD_PHI2]] +; CHECK-NOT-VECTORIZED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDXB2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD2]] = fadd float [[TMP1]], [[ADD_PHI1]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 
[[N]]
+; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NOT-VECTORIZED: for.end:
+; CHECK-NOT-VECTORIZED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ]
+; CHECK-NOT-VECTORIZED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ]
+; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4
+; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
+; CHECK-NOT-VECTORIZED-NEXT: ret void
+;
+; CHECK-UNORDERED-LABEL: define void @fadd_strict_interleave
+; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-UNORDERED-NEXT: entry:
+; CHECK-UNORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
+; CHECK-UNORDERED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4
+; CHECK-UNORDERED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2
+; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
+; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
+; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-UNORDERED: vector.ph:
+; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
+; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A2]], i32 0
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A1]], i32 0
+; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x float> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
+; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <vscale x 8 x float>*
+; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP12]], align 4
+; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
+; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd <vscale x 4 x float> [[TMP13]], [[VEC_PHI1]]
+; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd <vscale x 4 x float> [[TMP14]], [[VEC_PHI]]
+; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-UNORDERED-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP15]])
+; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP16]])
+; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-UNORDERED: scalar.ph:
+; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
+; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
+; CHECK-UNORDERED-NEXT: [[ADD1]] = fadd float [[TMP22]], [[ADD_PHI2]]
+; CHECK-UNORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1
+; CHECK-UNORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
+; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
+; CHECK-UNORDERED-NEXT: [[ADD2]] = fadd float [[TMP23]], [[ADD_PHI1]]
+; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
+; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4
+; CHECK-UNORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
+; CHECK-UNORDERED-NEXT: ret void
+;
+; CHECK-ORDERED-LABEL: define void @fadd_strict_interleave
+; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-ORDERED-NEXT: entry:
+; CHECK-ORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
+; CHECK-ORDERED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4
+; CHECK-ORDERED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
+; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2
+; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
+; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
+; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-ORDERED: vector.ph:
+; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
+; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
+; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
+; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <vscale x 8 x float>*
+; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP10]], align 4
+; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
+; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP12]])
+; CHECK-ORDERED-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[TMP11]])
+; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-ORDERED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-ORDERED: middle.block:
+; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-ORDERED: scalar.ph:
+; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-ORDERED: for.body:
+; CHECK-ORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
+; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
+; CHECK-ORDERED-NEXT: [[ADD1]] = fadd float [[TMP18]], [[ADD_PHI2]]
+; CHECK-ORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1
+; CHECK-ORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
+; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
+; CHECK-ORDERED-NEXT: [[ADD2]] = fadd float [[TMP19]], [[ADD_PHI1]]
+; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
+; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-ORDERED: for.end:
+; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4
+; CHECK-ORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
+; CHECK-ORDERED-NEXT: ret void
+;
+; CHECK-ORDERED-TF-LABEL: define void @fadd_strict_interleave
+; CHECK-ORDERED-TF-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-ORDERED-TF-NEXT: entry:
+; CHECK-ORDERED-TF-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
+; CHECK-ORDERED-TF-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4
+; CHECK-ORDERED-TF-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
+; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2
+; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
+; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
+; CHECK-ORDERED-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-ORDERED-TF: vector.ph:
+; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP7]]
+; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-ORDERED-TF-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
+; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]]
+; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]])
+; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i64> [[TMP13]], zeroinitializer
+; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = mul <vscale x 4 x i64> [[TMP14]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP15]]
+; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-ORDERED-TF-NEXT:
[[TMP18:%.*]] = mul i64 2, [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP18]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED-TF: vector.body: +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], [[VEC_IND]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], [[TMP20]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP21]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP23]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP22]]) +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP25]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP24]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]]) +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = extractelement [[TMP28]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-ORDERED-TF: middle.block: +; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF: scalar.ph: +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED-TF: for.body: +; CHECK-ORDERED-TF-NEXT: [[ADD_PHI1:%.*]] 
= phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = load float, float* [[ARRAYIDXB1]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD1]] = fadd float [[TMP30]], [[ADD_PHI2]] +; CHECK-ORDERED-TF-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDXB2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD2]] = fadd float [[TMP31]], [[ADD_PHI1]] +; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-ORDERED-TF: for.end: +; CHECK-ORDERED-TF-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4 +; CHECK-ORDERED-TF-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4 +; CHECK-ORDERED-TF-NEXT: ret void +; + + + entry: %arrayidxa = getelementptr inbounds float, float* %a, i64 1 @@ -272,57 +846,237 @@ } define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { -; CHECK-ORDERED-LABEL: @fadd_of_sum -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load , * -; CHECK-ORDERED: %[[LOAD2:.*]] = load , * -; CHECK-ORDERED: %[[ADD:.*]] = fadd %[[LOAD1]], %[[LOAD2]] -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], %[[ADD]]) -; CHECK-ORDERED: for.end.loopexit -; CHECK-ORDERED: %[[EXIT_PHI:.*]] = phi float [ {{.*}}, %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ] -; CHECK-ORDERED: ret float %[[PHI]] - -; CHECK-ORDERED-TF-LABEL: @fadd_of_sum -; CHECK-ORDERED-TF: vector.body -; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK:.*]] = phi -; CHECK-ORDERED-TF: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* -; CHECK-ORDERED-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* -; CHECK-ORDERED-TF: %[[ADD:.*]] = fadd %[[LOAD1]], %[[LOAD2]] -; CHECK-ORDERED-TF: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[ADD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], %[[SEL]]) -; CHECK-ORDERED-TF: for.end.loopexit -; CHECK-ORDERED-TF: %[[EXIT_PHI:.*]] = phi float [ {{.*}}, %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED-TF: for.end -; CHECK-ORDERED-TF: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], 
%for.end.loopexit ] -; CHECK-ORDERED-TF: ret float %[[PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_of_sum -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load , * -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load , * -; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd %[[VEC_LOAD1]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd %[[VEC_PHI]], %[[VEC_FADD1]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], %[[LOAD2]] -; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[FADD1]] -; CHECK-UNORDERED: for.end.loopexit -; CHECK-UNORDERED: %[[EXIT:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ] -; CHECK-UNORDERED: ret float %[[SUM]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_of_sum +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-NOT-VECTORIZED: for.body.preheader: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]] +; CHECK-NOT-VECTORIZED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]] +; CHECK-NOT-VECTORIZED: for.end.loopexit: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_END]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = 
phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RES]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_of_sum +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01 +; CHECK-UNORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-UNORDERED: for.body.preheader: +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]] +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP5]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , * [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd [[VEC_PHI]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP13]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; 
CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP18]], [[TMP19]] +; CHECK-UNORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-UNORDERED: for.end.loopexit: +; CHECK-UNORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_END]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-UNORDERED-NEXT: ret float [[RES]] +; +; CHECK-ORDERED-LABEL: define float @fadd_of_sum +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01 +; CHECK-ORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-ORDERED: for.body.preheader: +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]] +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP8]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr 
inbounds float, float* [[B]], i64 [[TMP5]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , * [[TMP11]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP12]]) +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP17]], [[TMP18]] +; CHECK-ORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-ORDERED: for.end.loopexit: +; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_END]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-ORDERED-NEXT: ret float [[RES]] +; +; CHECK-ORDERED-TF-LABEL: define float @fadd_of_sum +; CHECK-ORDERED-TF-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-TF-NEXT: entry: +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01 +; CHECK-ORDERED-TF-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-ORDERED-TF: for.body.preheader: +; CHECK-ORDERED-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED-TF: vector.ph: +; CHECK-ORDERED-TF-NEXT: 
[[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 +; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]] +; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]] +; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[TMP7]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED-TF: vector.body: +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP20]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP19]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]]) +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-ORDERED-TF: middle.block: +; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; 
CHECK-ORDERED-TF: scalar.ph: +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED-TF: for.body: +; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-ORDERED-TF-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD:%.*]] = fadd float [[TMP25]], [[TMP26]] +; CHECK-ORDERED-TF-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-ORDERED-TF: for.end.loopexit: +; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_END]] +; CHECK-ORDERED-TF: for.end: +; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-ORDERED-TF-NEXT: ret float [[RES]] +; + + + entry: %arrayidx = getelementptr inbounds float, float* %a, i64 1 @@ -349,77 +1103,234 @@ } define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { -; CHECK-ORDERED-LABEL: @fadd_conditional -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD:.*]] = load , * -; CHECK-ORDERED: %[[FCMP:.*]] = fcmp une %[[LOAD]], zeroinitializer -; CHECK-ORDERED: %[[MASKED_LOAD:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* {{.*}}, i32 4, %[[FCMP]], poison) -; CHECK-ORDERED: %[[XOR:.*]] = xor %[[FCMP]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED: %[[SELECT:.*]] = select %[[XOR]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), %[[MASKED_LOAD]] -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI]], %[[SELECT]]) -; CHECK-ORDERED: scalar.ph -; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ 1.000000e+00, %entry ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[RES:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ] -; CHECK-ORDERED: if.then -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-ORDERED: for.inc -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[LOAD2]], %if.then ], [ 3.000000e+00, %for.body ] -; CHECK-ORDERED: %[[FADD]] = fadd float %[[RES]], %[[PHI]] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: ret float %[[RDX_PHI]] - -; CHECK-ORDERED-TF-LABEL: @fadd_conditional -; 
CHECK-ORDERED-TF: vector.body -; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK:.*]] = phi -; CHECK-ORDERED-TF: %[[VEC_PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED-TF: %[[LOAD:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* -; CHECK-ORDERED-TF: %[[FCMP:.*]] = fcmp une %[[LOAD]], zeroinitializer -; CHECK-ORDERED-TF: %[[SELECT0:.*]] = select %[[ACTIVE_LANE_MASK]], %[[FCMP]], zeroinitializer -; CHECK-ORDERED-TF: %[[MASKED_LOAD:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* {{.*}}, i32 4, %[[SELECT0]], poison) -; CHECK-ORDERED-TF: %[[XOR:.*]] = xor %[[FCMP]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[SELECT1:.*]] = select %[[ACTIVE_LANE_MASK]], %[[XOR]], zeroinitializer -; CHECK-ORDERED-TF: %[[SELECT2:.*]] = select %[[SELECT1]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), %[[MASKED_LOAD]] -; CHECK-ORDERED-TF: %[[OR:.*]] = or %[[SELECT0]], %[[SELECT1]] -; CHECK-ORDERED-TF: %[[SELECT3:.*]] = select %[[OR]], %[[SELECT2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI]], %[[SELECT3]]) -; CHECK-ORDERED-TF: scalar.ph -; CHECK-ORDERED-TF: %[[MERGE_RDX:.*]] = phi float [ 1.000000e+00, %entry ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED-TF: for.body -; CHECK-ORDERED-TF: %[[RES:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ] -; CHECK-ORDERED-TF: if.then -; CHECK-ORDERED-TF: %[[LOAD2:.*]] = load float, float* -; CHECK-ORDERED-TF: for.inc -; CHECK-ORDERED-TF: %[[PHI:.*]] = phi float [ %[[LOAD2]], %if.then ], [ 3.000000e+00, %for.body ] -; CHECK-ORDERED-TF: %[[FADD]] = fadd float %[[RES]], %[[PHI]] -; CHECK-ORDERED-TF: for.end -; CHECK-ORDERED-TF: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED-TF: ret float %[[RDX_PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_conditional -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 1.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load , * -; CHECK-UNORDERED: %[[FCMP:.*]] = fcmp une %[[LOAD1]], zeroinitializer -; CHECK-UNORDERED: %[[MASKED_LOAD:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* {{.*}}, i32 4, %[[FCMP]], poison) -; CHECK-UNORDERED: %[[XOR:.*]] = xor %[[FCMP]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-UNORDERED: %[[SELECT:.*]] = select %[[XOR]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), %[[MASKED_LOAD]] -; CHECK-UNORDERED: %[[VEC_FADD]] = fadd %[[VEC_PHI]], %[[SELECT]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, %[[VEC_FADD]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD:.*]], %for.inc ] -; CHECK-UNORDERED: for.inc -; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RES]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RDX_PHI]] - -; 
CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_conditional +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[FOR_INC]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP0]], 0.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-NOT-VECTORIZED: if.then: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_INC]] +; CHECK-NOT-VECTORIZED: for.inc: +; CHECK-NOT-VECTORIZED-NEXT: [[PHI:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_conditional +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to * +; 
CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP11]], i32 4, [[TMP8]], poison) +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd [[VEC_PHI]], [[PREDPHI]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP13]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP18]], 0.000000e+00 +; CHECK-UNORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-UNORDERED: if.then: +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: br label [[FOR_INC]] +; CHECK-UNORDERED: for.inc: +; CHECK-UNORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP19]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-LABEL: define float @fadd_conditional +; CHECK-ORDERED-SAME: (float* noalias nocapture 
readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to * +; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP11]], i32 4, [[TMP8]], poison) +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] +; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[PREDPHI]]) +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = 
load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP17]], 0.000000e+00 +; CHECK-ORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-ORDERED: if.then: +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: br label [[FOR_INC]] +; CHECK-ORDERED: for.inc: +; CHECK-ORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP18]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-TF-LABEL: define float @fadd_conditional +; CHECK-ORDERED-TF-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-TF-NEXT: entry: +; CHECK-ORDERED-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED-TF: vector.ph: +; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] +; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED-TF: vector.body: +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = fcmp une [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr float, 
float* [[A]], i64 [[TMP10]]
+; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr float, float* [[TMP15]], i32 0
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <vscale x 4 x float>*
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i1> [[TMP14]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
+; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP20]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
+; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = or <vscale x 4 x i1> [[TMP16]], [[TMP20]]
+; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select <vscale x 4 x i1> [[TMP21]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP23]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP22]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 4
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]]
+; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = extractelement <vscale x 4 x i1> [[TMP26]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-ORDERED-TF: middle.block:
+; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-ORDERED-TF: scalar.ph:
+; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-ORDERED-TF: for.body:
+; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
+; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
+; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP28]], 0.000000e+00
+; CHECK-ORDERED-TF-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK-ORDERED-TF: if.then:
+; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
+; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-TF-NEXT: br label [[FOR_INC]]
+; CHECK-ORDERED-TF: for.inc:
+; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP29]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
+; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-ORDERED-TF-NEXT:
br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-ORDERED-TF: for.end: +; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: ret float [[RDX]] +; + + + entry: br label %for.body @@ -451,33 +1362,125 @@ ; Negative test - loop contains multiple fadds which we cannot safely reorder define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) #0 { -; CHECK-ORDERED-LABEL: @fadd_multiple -; CHECK-ORDERED-NOT: vector.body - -; CHECK-ORDERED-TF-LABEL: @fadd_multiple -; CHECK-ORDERED-TF-NOT: vector.body - -; CHECK-UNORDERED-LABEL: @fadd_multiple -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[PHI:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float -0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load , -; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd %[[PHI]], %[[VEC_LOAD1]] -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load , -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd %[[VEC_FADD1]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[SUM]], %[[LOAD1]] -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RET]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_multiple +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_multiple +; 
CHECK-UNORDERED-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = fadd [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , * [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd [[TMP8]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP12]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; 
CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP17]] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP18]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-LABEL: define float @fadd_multiple +; CHECK-ORDERED-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-TF-LABEL: define float @fadd_multiple +; CHECK-ORDERED-TF-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-TF-NEXT: entry: +; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED-TF: for.body: +; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]] +; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-ORDERED-TF: for.end: +; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: 
ret float [[RDX]] +; + + + entry: br label %for.body @@ -502,95 +1505,422 @@ ; Test case where loop has a call to the llvm.fmuladd intrinsic. define float @fmuladd_strict(float* %a, float* %b, i64 %n) #0 { -; CHECK-ORDERED-LABEL: @fmuladd_strict -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ] -; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load , * -; CHECK-ORDERED: [[FMUL:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED: [[FMUL1:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED: [[FMUL2:%.*]] = fmul [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED: [[FMUL3:%.*]] = fmul [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[FMUL]]) -; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], [[FMUL1]]) -; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], [[FMUL2]]) -; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], [[FMUL3]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ] -; CHECK-ORDERED: ret float [[RES]] - -; CHECK-ORDERED-TF-LABEL: @fmuladd_strict -; CHECK-ORDERED-TF: vector.body: -; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK:%.*]] = phi -; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK1:%.*]] = phi -; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK2:%.*]] = phi -; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK3:%.*]] = phi -; CHECK-ORDERED-TF: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ] -; CHECK-ORDERED-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD1:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD2:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD3:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD4:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD5:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD6:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD7:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[FMUL:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED-TF: [[FMUL1:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED-TF: [[FMUL2:%.*]] = fmul [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED-TF: [[FMUL3:%.*]] = fmul [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED-TF: [[SEL:%.*]] = select [[ACTIVE_LANE_MASK]], [[FMUL]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[SEL]]) -; CHECK-ORDERED-TF: [[SEL1:%.*]] = select [[ACTIVE_LANE_MASK1]], [[FMUL1]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], [[SEL1]]) -; CHECK-ORDERED-TF: [[SEL2:%.*]] = select [[ACTIVE_LANE_MASK2]], [[FMUL2]], shufflevector ( insertelement ( 
poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], [[SEL2]]) -; CHECK-ORDERED-TF: [[SEL3:%.*]] = select [[ACTIVE_LANE_MASK3]], [[FMUL3]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: [[RDX3]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], [[SEL3]]) -; CHECK-ORDERED-TF: for.end -; CHECK-ORDERED-TF: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ] -; CHECK-ORDERED-TF: ret float [[RES]] - -; CHECK-UNORDERED-LABEL: @fmuladd_strict -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ] -; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load , * -; CHECK-UNORDERED: [[FMULADD]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD4]], [[VEC_PHI]]) -; CHECK-UNORDERED: [[FMULADD1]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD1]], [[WIDE_LOAD5]], [[VEC_PHI1]]) -; CHECK-UNORDERED: [[FMULADD2]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD2]], [[WIDE_LOAD6]], [[VEC_PHI2]]) -; CHECK-UNORDERED: [[FMULADD3]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD3]], [[WIDE_LOAD7]], [[VEC_PHI3]]) -; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd [[FMULADD1]], [[FMULADD]] -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd [[FMULADD2]], [[BIN_RDX]] -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd [[FMULADD3]], [[BIN_RDX1]] -; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX2]] -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float* -; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]]) -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_strict +; CHECK-NOT-VECTORIZED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; 
CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_strict +; CHECK-UNORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32 +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP56:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP59:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; 
CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]] +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]] +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP25]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 8 +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP27]] +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , * [[TMP29]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP31]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , * [[TMP33]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 24 +; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP35]] +; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , * [[TMP37]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]] +; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]] +; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]] +; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , * [[TMP43]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 8 +; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP45]] +; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , * [[TMP47]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP49:%.*]] = mul i64 [[TMP48]], 16 +; CHECK-UNORDERED-NEXT: [[TMP50:%.*]] = getelementptr 
inbounds float, float* [[TMP38]], i64 [[TMP49]] +; CHECK-UNORDERED-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , * [[TMP51]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 24 +; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP53]] +; CHECK-UNORDERED-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , * [[TMP55]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP56]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP57]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP58]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP59]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 32 +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP61]] +; CHECK-UNORDERED-NEXT: [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP62]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP57]], [[TMP56]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd [[TMP58]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd [[TMP59]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[TMP63:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX12]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP64:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP65:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_strict +; 
CHECK-ORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32 +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP63:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP25]], align 4 +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 8 +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP27]] +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , * [[TMP29]], align 4 +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP31]] +; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , * [[TMP33]], align 4 +; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 24 +; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr 
inbounds float, float* [[TMP20]], i64 [[TMP35]] +; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , * [[TMP37]], align 4 +; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] +; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]] +; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]] +; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , * [[TMP43]], align 4 +; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 8 +; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP45]] +; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , * [[TMP47]], align 4 +; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = mul i64 [[TMP48]], 16 +; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP49]] +; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , * [[TMP51]], align 4 +; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 24 +; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP53]] +; CHECK-ORDERED-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , * [[TMP55]], align 4 +; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = fmul [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED-NEXT: [[TMP59:%.*]] = fmul [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED-NEXT: [[TMP60:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP56]]) +; CHECK-ORDERED-NEXT: [[TMP61:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP60]], [[TMP57]]) +; CHECK-ORDERED-NEXT: [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP61]], [[TMP58]]) +; CHECK-ORDERED-NEXT: [[TMP63]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], [[TMP59]]) +; CHECK-ORDERED-NEXT: [[TMP64:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP65:%.*]] = mul i64 [[TMP64]], 32 +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP65]] +; CHECK-ORDERED-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; 
CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP67:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP68:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP67]], float [[TMP68]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict +; CHECK-ORDERED-TF-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-TF-NEXT: entry: +; CHECK-ORDERED-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED-TF: vector.ph: +; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] +; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]] +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 
[[N]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED-TF: vector.body: +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP94:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP31]] +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP36]] +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP41]] +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, 
float* [[TMP47]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = bitcast float* [[TMP51]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP52]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[TMP47]], i64 [[TMP54]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = bitcast float* [[TMP55]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP56]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP47]], i64 [[TMP58]] +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = bitcast float* [[TMP59]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP60]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP47]], i64 [[TMP62]] +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP64]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP31]] +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP36]] +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]] +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = bitcast float* [[TMP69]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP70]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP65]], i64 [[TMP72]] +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = bitcast float* [[TMP73]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP74]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP65]], i64 [[TMP76]] +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = bitcast float* [[TMP77]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP78]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[TMP65]], i64 [[TMP80]] +; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = bitcast float* [[TMP81]] to * +; 
CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP82]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] +; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = fmul [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] +; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = fmul [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] +; CHECK-ORDERED-TF-NEXT: [[TMP86:%.*]] = fmul [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] +; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP83]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP87]]) +; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = select [[ACTIVE_LANE_MASK6]], [[TMP84]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP88]], [[TMP89]]) +; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP85]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP90]], [[TMP91]]) +; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP86]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP94]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP92]], [[TMP93]]) +; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] +; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = mul i64 [[TMP98]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = add i64 [[INDEX]], [[TMP99]] +; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = mul i64 [[TMP101]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = add i64 [[INDEX]], [[TMP102]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP20]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP100]], i64 [[TMP25]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP103]], i64 [[TMP30]]) +; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP105:%.*]] = mul i64 [[TMP104]], 32 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP105]] +; CHECK-ORDERED-TF-NEXT: [[TMP106:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP107:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP108:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP109:%.*]] = xor 
[[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP110:%.*]] = extractelement [[TMP106]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP110]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-ORDERED-TF: middle.block: +; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF: scalar.ph: +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP94]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED-TF: for.body: +; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP111:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP112:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP111]], float [[TMP112]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-ORDERED-TF: for.end: +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP94]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] +; + + + entry: br label %for.body @@ -613,95 +1943,422 @@ ; Same as above but where the call to the llvm.fmuladd intrinsic has a fast-math flag. 
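;
; The checks that follow mirror those for @fmuladd_strict, with the nnan flag on the
; scalar llvm.fmuladd call propagated into the vectorized code. As a minimal sketch
; (illustrative value names only; the generated checks below use <vscale x 8 x float>
; operands, and the tail-folded variant additionally selects a -0.0 splat for inactive
; lanes before reducing), the flagged in-order reduction has the form:
;
;   %mul = fmul nnan <vscale x 8 x float> %wide.load.a, %wide.load.b
;   %rdx = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float %phi, <vscale x 8 x float> %mul)
;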
define float @fmuladd_strict_fmf(float* %a, float* %b, i64 %n) #0 { -; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ] -; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load , * -; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load , * -; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[FMUL]]) -; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], [[FMUL1]]) -; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], [[FMUL2]]) -; CHECK-ORDERED: [[RDX3]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], [[FMUL3]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ] -; CHECK-ORDERED: ret float [[RES]] - -; CHECK-ORDERED-TF-LABEL: @fmuladd_strict_fmf -; CHECK-ORDERED-TF: vector.body: -; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK:%.*]] = phi -; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK1:%.*]] = phi -; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK2:%.*]] = phi -; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK3:%.*]] = phi -; CHECK-ORDERED-TF: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ] -; CHECK-ORDERED-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD1:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD2:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD3:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD4:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD5:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD6:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[WIDE_LOAD7:%.*]] = call @llvm.masked.load.nxv8f32 -; CHECK-ORDERED-TF: [[FMUL:%.*]] = fmul nnan [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED-TF: [[FMUL1:%.*]] = fmul nnan [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED-TF: [[FMUL2:%.*]] = fmul nnan [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED-TF: [[FMUL3:%.*]] = fmul nnan [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED-TF: [[SEL:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[FMUL]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[SEL]]) -; CHECK-ORDERED-TF: [[SEL1:%.*]] = select nnan [[ACTIVE_LANE_MASK1]], [[FMUL1]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], [[SEL1]]) -; CHECK-ORDERED-TF: [[SEL2:%.*]] = select nnan [[ACTIVE_LANE_MASK2]], [[FMUL2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, 
zeroinitializer) -; CHECK-ORDERED-TF: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], [[SEL2]]) -; CHECK-ORDERED-TF: [[SEL3:%.*]] = select nnan [[ACTIVE_LANE_MASK3]], [[FMUL3]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF: [[RDX3]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], [[SEL3]]) -; CHECK-ORDERED-TF: for.end -; CHECK-ORDERED-TF: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ] -; CHECK-ORDERED-TF: ret float [[RES]] - -; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ] -; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load , * -; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load , * -; CHECK-UNORDERED: [[FMULADD]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD4]], [[VEC_PHI]]) -; CHECK-UNORDERED: [[FMULADD1]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD1]], [[WIDE_LOAD5]], [[VEC_PHI1]]) -; CHECK-UNORDERED: [[FMULADD2]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD2]], [[WIDE_LOAD6]], [[VEC_PHI2]]) -; CHECK-UNORDERED: [[FMULADD3]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD3]], [[WIDE_LOAD7]], [[VEC_PHI3]]) -; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan [[FMULADD1]], [[FMULADD]] -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan [[FMULADD2]], [[BIN_RDX]] -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan [[FMULADD3]], [[BIN_RDX1]] -; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX2]] -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float* -; CHECK-UNORDERED: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]]) -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict_fmf -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_strict_fmf +; CHECK-NOT-VECTORIZED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NOT-VECTORIZED-NEXT: 
entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_strict_fmf +; CHECK-UNORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32 +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP56:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP59:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]] +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]] +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP25]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 8 +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP27]] +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , * [[TMP29]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP31]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , * [[TMP33]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 24 +; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP35]] +; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , * [[TMP37]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]] +; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]] +; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]] +; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , * [[TMP43]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 8 +; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP45]] +; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , * [[TMP47]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP49:%.*]] = mul i64 [[TMP48]], 16 +; CHECK-UNORDERED-NEXT: 
[[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP49]] +; CHECK-UNORDERED-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , * [[TMP51]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 24 +; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP53]] +; CHECK-UNORDERED-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to * +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , * [[TMP55]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP56]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP57]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP58]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP59]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 32 +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP61]] +; CHECK-UNORDERED-NEXT: [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP62]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan [[TMP57]], [[TMP56]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd nnan [[TMP58]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd nnan [[TMP59]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[TMP63:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX12]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP64:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP65:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float 
[[MULADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_strict_fmf +; CHECK-ORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32 +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP63:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP25]], align 4 +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 8 +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP27]] +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , * [[TMP29]], align 4 +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP31]] +; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , * [[TMP33]], align 4 +; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP35:%.*]] 
= mul i64 [[TMP34]], 24 +; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[TMP35]] +; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , * [[TMP37]], align 4 +; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] +; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]] +; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]] +; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , * [[TMP43]], align 4 +; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 8 +; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP45]] +; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , * [[TMP47]], align 4 +; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = mul i64 [[TMP48]], 16 +; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP49]] +; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , * [[TMP51]], align 4 +; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 24 +; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 [[TMP53]] +; CHECK-ORDERED-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to * +; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , * [[TMP55]], align 4 +; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = fmul nnan [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = fmul nnan [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = fmul nnan [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED-NEXT: [[TMP59:%.*]] = fmul nnan [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED-NEXT: [[TMP60:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP56]]) +; CHECK-ORDERED-NEXT: [[TMP61:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP60]], [[TMP57]]) +; CHECK-ORDERED-NEXT: [[TMP62:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP61]], [[TMP58]]) +; CHECK-ORDERED-NEXT: [[TMP63]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], [[TMP59]]) +; CHECK-ORDERED-NEXT: [[TMP64:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP65:%.*]] = mul i64 [[TMP64]], 32 +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP65]] +; CHECK-ORDERED-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 
0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP67:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP68:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP67]], float [[TMP68]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict_fmf +; CHECK-ORDERED-TF-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-ORDERED-TF-NEXT: entry: +; CHECK-ORDERED-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED-TF: vector.ph: +; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] +; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]] +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() 
+; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED-TF: vector.body: +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP94:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP31]] +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP36]] +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP41]] +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr 
inbounds float, float* [[A]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[TMP47]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = bitcast float* [[TMP51]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP52]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[TMP47]], i64 [[TMP54]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = bitcast float* [[TMP55]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP56]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP47]], i64 [[TMP58]] +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = bitcast float* [[TMP59]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP60]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP47]], i64 [[TMP62]] +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP64]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP31]] +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP36]] +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]] +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = bitcast float* [[TMP69]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP70]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP65]], i64 [[TMP72]] +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = bitcast float* [[TMP73]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP74]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP65]], i64 [[TMP76]] +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = bitcast float* [[TMP77]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP78]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = getelementptr inbounds 
float, float* [[TMP65]], i64 [[TMP80]] +; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = bitcast float* [[TMP81]] to * +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(* [[TMP82]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] +; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = fmul nnan [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] +; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = fmul nnan [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] +; CHECK-ORDERED-TF-NEXT: [[TMP86:%.*]] = fmul nnan [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] +; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[TMP83]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP87]]) +; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = select nnan [[ACTIVE_LANE_MASK6]], [[TMP84]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP88]], [[TMP89]]) +; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = select nnan [[ACTIVE_LANE_MASK7]], [[TMP85]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP90]], [[TMP91]]) +; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP86]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP94]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP92]], [[TMP93]]) +; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] +; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = mul i64 [[TMP98]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = add i64 [[INDEX]], [[TMP99]] +; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = mul i64 [[TMP101]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = add i64 [[INDEX]], [[TMP102]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP20]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP100]], i64 [[TMP25]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP103]], i64 [[TMP30]]) +; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP105:%.*]] = mul i64 [[TMP104]], 32 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP105]] +; CHECK-ORDERED-TF-NEXT: [[TMP106:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP107:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP108:%.*]] = 
xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP109:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP110:%.*]] = extractelement [[TMP106]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP110]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-ORDERED-TF: middle.block: +; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF: scalar.ph: +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP94]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED-TF: for.body: +; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP111:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-TF-NEXT: [[TMP112:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP111]], float [[TMP112]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-ORDERED-TF: for.end: +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP94]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] +; + + + entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED @@ -5,32 +6,108 @@ ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: %[[VEC_PHI:.*]] = 
phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: ret float %[[PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_strict -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <8 x float> %[[LOAD_VEC]], %[[VEC_PHI]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[FADD_VEC]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_strict +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 
[[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP4]] = fadd <8 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP4]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP7]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-ORDERED-NEXT: [[TMP4]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) 
+; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP6]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + + entry: br label %for.body @@ -51,32 +128,108 @@ ; Same as above but where fadd has a fast-math flag. 
define float @fadd_strict_fmf(float* noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict_fmf -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX:%.*]], %vector.body ] -; CHECK-ORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: [[RDX]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[LOAD_VEC]]) -; CHECK-ORDERED: for.end: -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-ORDERED: ret float [[RES]] - -; CHECK-UNORDERED-LABEL: @fadd_strict_fmf -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ , %vector.ph ], [ [[FADD_VEC:%.*]], %vector.body ] -; CHECK-UNORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[FADD_VEC]] = fadd nnan <8 x float> [[LOAD_VEC]], [[VEC_PHI]] -; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[FADD_VEC]]) -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* -; CHECK-UNORDERED: [[FADD:%.*]] = fadd nnan float [[LOAD]], {{.*}} -; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_fmf -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict_fmf +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd nnan float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_strict_fmf +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ 
, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP4]] = fadd nnan <8 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP4]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP7]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict_fmf +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x 
float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-ORDERED-NEXT: [[TMP4]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP6]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + + entry: br label %for.body @@ -96,51 +249,150 @@ } define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict_unroll -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ] -; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]]) -; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]]) -; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]]) -; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ] -; CHECK-ORDERED: ret float %[[PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_strict_unroll -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: 
%[[VEC_PHI3:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_LOAD1]], %[[VEC_PHI1]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_LOAD2]], %[[VEC_PHI2]] -; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_LOAD3]], %[[VEC_PHI3]] -; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_LOAD4]], %[[VEC_PHI4]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]] -; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]] -; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]] -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict_unroll +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_strict_unroll +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; 
CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16 +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd <8 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP17]] = fadd <8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP18]] = fadd <8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP19]] = fadd <8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP17]], [[TMP16]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd <8 x float> [[TMP18]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd <8 x float> [[TMP19]], [[BIN_RDX7]] +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX8]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP22]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict_unroll +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16 +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24 +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = 
bitcast float* [[TMP14]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP16]], <8 x float> [[WIDE_LOAD1]]) +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP17]], <8 x float> [[WIDE_LOAD2]]) +; CHECK-ORDERED-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP18]], <8 x float> [[WIDE_LOAD3]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP21]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + + entry: br label %for.body @@ -168,63 +420,174 @@ ; return sum; define float @fadd_strict_unroll_last_val(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict_unroll_last_val -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ] -; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]]) -; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]]) -; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]]) -; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]]) -; CHECK-ORDERED: 
for.body -; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ {{.*}}, %scalar.ph ] -; CHECK-ORDERED: %[[LOAD5:.*]] = load float, float* -; CHECK-ORDERED: %[[FADD]] = fadd float %[[SUM_PHI]], %[[LOAD5]] -; CHECK-ORDERED: for.cond.cleanup -; CHECK-ORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX4]], %middle.block ] -; CHECK-ORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01 -; CHECK-ORDERED: store float %[[FADD_42]], float* %b -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ] -; CHECK-ORDERED: ret float %[[SUM_LCSSA]] - -; CHECK-UNORDERED-LABEL: @fadd_strict_unroll_last_val -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_PHI1]], %[[VEC_LOAD1]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_PHI2]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_PHI3]], %[[VEC_LOAD3]] -; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_PHI4]], %[[VEC_LOAD4]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]] -; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]] -; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]] -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float {{.*}}, %[[LOAD]] -; CHECK-UNORDERED: for.cond.cleanup -; CHECK-UNORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01 -; CHECK-UNORDERED: store float %[[FADD_42]], float* %b -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ] -; CHECK-UNORDERED: ret float %[[SUM_LCSSA]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll_last_val -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict_unroll_last_val +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-NOT-VECTORIZED: for.body.preheader: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: 
for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[SUM]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.cond.cleanup: +; CHECK-NOT-VECTORIZED-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01 +; CHECK-NOT-VECTORIZED-NEXT: store float [[FADD2]], float* [[B]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_END]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[SUM_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_strict_unroll_last_val +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-UNORDERED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-UNORDERED: for.body.preheader: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: 
[[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16 +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP17]] = fadd <8 x float> [[VEC_PHI1]], [[WIDE_LOAD4]] +; CHECK-UNORDERED-NEXT: [[TMP18]] = fadd <8 x float> [[VEC_PHI2]], [[WIDE_LOAD5]] +; CHECK-UNORDERED-NEXT: [[TMP19]] = fadd <8 x float> [[VEC_PHI3]], [[WIDE_LOAD6]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP17]], [[TMP16]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd <8 x float> [[TMP18]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd <8 x float> [[TMP19]], [[BIN_RDX7]] +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX8]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[SUM]], [[TMP22]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-UNORDERED: for.cond.cleanup: +; CHECK-UNORDERED-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01 +; CHECK-UNORDERED-NEXT: store float [[FADD2]], float* 
[[B]], align 4 +; CHECK-UNORDERED-NEXT: br label [[FOR_END]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: ret float [[SUM_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict_unroll_last_val +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-ORDERED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-ORDERED: for.body.preheader: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16 +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24 +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP16]], <8 x float> [[WIDE_LOAD1]]) +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP17]], <8 x float> [[WIDE_LOAD2]]) +; CHECK-ORDERED-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP18]], <8 x float> [[WIDE_LOAD3]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], 32 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[SUM]], [[TMP21]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-ORDERED: for.cond.cleanup: +; CHECK-ORDERED-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01 +; CHECK-ORDERED-NEXT: store float [[FADD2]], float* [[B]], align 4 +; CHECK-ORDERED-NEXT: br label [[FOR_END]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: ret float [[SUM_LCSSA]] +; + + entry: %cmp = icmp sgt i64 %n, 0 @@ -252,55 +615,172 @@ } define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict_interleave -; CHECK-ORDERED: entry -; CHECK-ORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1 -; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float* %a -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]] -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ] -; CHECK-ORDERED: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ] -; CHECK-ORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> -; CHECK-ORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> -; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]]) -; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: ret void - -; CHECK-UNORDERED-LABEL: @fadd_strict_interleave -; CHECK-UNORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1 -; CHECK-UNORDERED: %[[LOADA1:.*]] = load float, float* %a -; CHECK-UNORDERED: 
%[[LOADA2:.*]] = load float, float* %[[ARRAYIDX]] -; CHECK-UNORDERED: vector.ph -; CHECK-UNORDERED: %[[INS2:.*]] = insertelement <4 x float> , float %[[LOADA2]], i32 0 -; CHECK-UNORDERED: %[[INS1:.*]] = insertelement <4 x float> , float %[[LOADA1]], i32 0 -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <4 x float> [ %[[INS2]], %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <4 x float> [ %[[INS1]], %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> -; CHECK-UNORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> -; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <4 x float> %[[STRIDED1:.*]], %[[VEC_PHI1]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[STRIDED2:.*]], %[[VEC_PHI2]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD1]]) -; CHECK-UNORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], {{.*}} -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float %[[LOAD2]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM1:.*]] = phi float [ %[[FADD1]], %for.body ], [ %[[RDX1]], %middle.block ] -; CHECK-UNORDERED: %[[SUM2:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX2]], %middle.block ] -; CHECK-UNORDERED: store float %[[SUM1]] -; CHECK-UNORDERED: store float %[[SUM2]] -; CHECK-UNORDERED: ret void - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define void @fadd_strict_interleave +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-NOT-VECTORIZED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[A2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDXB1]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD1]] = fadd float [[TMP0]], [[ADD_PHI2]] +; CHECK-NOT-VECTORIZED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDXB2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD2]] = fadd float 
[[TMP1]], [[ADD_PHI1]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: ret void +; +; CHECK-UNORDERED-LABEL: define void @fadd_strict_interleave +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-UNORDERED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4 +; CHECK-UNORDERED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[A2]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = insertelement <4 x float> , float [[A1]], i32 0 +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP5]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = insertvalue [[TMP0]] poison, <4 x float> [[STRIDED_VEC]], 0 +; CHECK-UNORDERED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = insertvalue [[TMP0]] [[TMP9]], <4 x float> [[STRIDED_VEC2]], 1 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = extractvalue [[TMP0]] [[TMP10]], 0 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = extractvalue [[TMP0]] [[TMP10]], 1 +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd <4 x float> 
[[TMP11]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd <4 x float> [[TMP12]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP13]]) +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP14]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDXB1]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD1]] = fadd float [[TMP18]], [[ADD_PHI2]] +; CHECK-UNORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]] +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDXB2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD2]] = fadd float [[TMP19]], [[ADD_PHI1]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4 +; CHECK-UNORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4 +; CHECK-UNORDERED-NEXT: ret void +; +; CHECK-ORDERED-LABEL: define void @fadd_strict_interleave +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-ORDERED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4 +; CHECK-ORDERED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4 +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 
+; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 +; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = insertvalue [[TMP0]] poison, <4 x float> [[STRIDED_VEC]], 0 +; CHECK-ORDERED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = insertvalue [[TMP0]] [[TMP7]], <4 x float> [[STRIDED_VEC2]], 1 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = extractvalue [[TMP0]] [[TMP8]], 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = extractvalue [[TMP0]] [[TMP8]], 1 +; CHECK-ORDERED-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[TMP10]]) +; CHECK-ORDERED-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI1]], <4 x float> [[TMP9]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDXB1]], align 4 +; 
CHECK-ORDERED-NEXT: [[ADD1]] = fadd float [[TMP14]], [[ADD_PHI2]] +; CHECK-ORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]] +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDXB2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD2]] = fadd float [[TMP15]], [[ADD_PHI1]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4 +; CHECK-ORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4 +; CHECK-ORDERED-NEXT: ret void +; + + entry: %arrayidxa = getelementptr inbounds float, float* %a, i64 1 @@ -330,42 +810,151 @@ } define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_of_sum -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>* -; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, <4 x float>* -; CHECK-ORDERED: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]] -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[ADD]]) -; CHECK-ORDERED: for.end.loopexit -; CHECK-ORDERED: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ] -; CHECK-ORDERED: ret float %[[PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_of_sum -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <4 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <4 x float>, <4 x float>* -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <4 x float>, <4 x float>* -; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <4 x float> %[[VEC_LOAD1]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[VEC_PHI]], %[[VEC_FADD1]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], %[[LOAD2]] -; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[FADD1]] -; CHECK-UNORDERED: for.end.loopexit -; CHECK-UNORDERED: %[[EXIT:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ] -; CHECK-UNORDERED: ret float %[[SUM]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_of_sum +; CHECK-NOT-VECTORIZED-SAME: (float* 
noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-NOT-VECTORIZED: for.body.preheader: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]] +; CHECK-NOT-VECTORIZED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]] +; CHECK-NOT-VECTORIZED: for.end.loopexit: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_END]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RES]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_of_sum +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01 +; CHECK-UNORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-UNORDERED: for.body.preheader: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = 
getelementptr inbounds float, float* [[TMP2]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd <4 x float> [[VEC_PHI]], [[TMP8]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP9]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP12]], [[TMP13]] +; CHECK-UNORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-UNORDERED: for.end.loopexit: +; CHECK-UNORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_END]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-UNORDERED-NEXT: ret float [[RES]] +; +; CHECK-ORDERED-LABEL: define float @fadd_of_sum +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float 
[[TMP0]], 5.000000e-01 +; CHECK-ORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-ORDERED: for.body.preheader: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-ORDERED-NEXT: [[TMP9]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[TMP8]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP11]], [[TMP12]] +; CHECK-ORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop 
[[LOOP13:![0-9]+]] +; CHECK-ORDERED: for.end.loopexit: +; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_END]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-ORDERED-NEXT: ret float [[RES]] +; + + entry: %arrayidx = getelementptr inbounds float, float* %a, i64 1 @@ -392,63 +981,216 @@ } define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_conditional -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue6 ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>* -; CHECK-ORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer -; CHECK-ORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0 -; CHECK-ORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue -; CHECK-ORDERED: pred.load.continue6 -; CHECK-ORDERED: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ] -; CHECK-ORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], -; CHECK-ORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> , <4 x float> %[[PHI1]] -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]]) -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ] -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-ORDERED: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00 -; CHECK-ORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc -; CHECK-ORDERED: if.then -; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float* -; CHECK-ORDERED: br label %for.inc -; CHECK-ORDERED: for.inc -; CHECK-ORDERED: %[[PHI2:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ] -; CHECK-ORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI2]] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: ret float %[[RDX_PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_conditional -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[PHI:.*]] = phi <4 x float> [ , %vector.ph ], [ %[[VEC_FADD:.*]], %pred.load.continue6 ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>* -; CHECK-UNORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer -; CHECK-UNORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0 -; CHECK-UNORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue -; CHECK-UNORDERED: pred.load.continue6 -; CHECK-UNORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], -; CHECK-UNORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> , <4 x float> %[[PRED_PHI:.*]] -; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[PHI]], %[[PRED]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ] -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-UNORDERED: 
%[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00 -; CHECK-UNORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc -; CHECK-UNORDERED: if.then -; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float* -; CHECK-UNORDERED: for.inc -; CHECK-UNORDERED: %[[PHI:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ] -; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RDX_PHI]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_conditional +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[FOR_INC]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP0]], 0.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-NOT-VECTORIZED: if.then: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_INC]] +; CHECK-NOT-VECTORIZED: for.inc: +; CHECK-NOT-VECTORIZED-NEXT: [[PHI:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_conditional +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B]], i64 
[[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-UNORDERED: pred.load.if: +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-UNORDERED: pred.load.continue: +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 +; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-UNORDERED: pred.load.if1: +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 1 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-UNORDERED: pred.load.continue2: +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-UNORDERED: pred.load.if3: +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]] +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP19]], i32 2 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK-UNORDERED: pred.load.continue4: +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 +; CHECK-UNORDERED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK-UNORDERED: pred.load.if5: +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 3 +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP25]], i32 3 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK-UNORDERED: pred.load.continue6: +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = phi <4 x float> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ] +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select <4 x 
i1> [[TMP28]], <4 x float> , <4 x float> [[TMP27]] +; CHECK-UNORDERED-NEXT: [[TMP29]] = fadd <4 x float> [[VEC_PHI]], [[PREDPHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP29]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP32]], 0.000000e+00 +; CHECK-UNORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-UNORDERED: if.then: +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: br label [[FOR_INC]] +; CHECK-UNORDERED: for.inc: +; CHECK-UNORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP33]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-LABEL: define float @fadd_conditional +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], float* noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* 
[[B]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-ORDERED: pred.load.if: +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-ORDERED: pred.load.continue: +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 +; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-ORDERED: pred.load.if1: +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 1 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-ORDERED: pred.load.continue2: +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-ORDERED: pred.load.if3: +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]] +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP19]], i32 2 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK-ORDERED: pred.load.continue4: +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 +; CHECK-ORDERED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK-ORDERED: pred.load.if5: +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 3 +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]] +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4 +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP25]], i32 3 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK-ORDERED: pred.load.continue6: +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = phi <4 x float> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ] +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP28]], <4 x float> , <4 x float> [[TMP27]] +; CHECK-ORDERED-NEXT: 
[[TMP29]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[PREDPHI]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP31]], 0.000000e+00 +; CHECK-ORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-ORDERED: if.then: +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: br label [[FOR_INC]] +; CHECK-ORDERED: for.inc: +; CHECK-ORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP32]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[RDX]] +; + + entry: br label %for.body @@ -480,44 +1222,150 @@ ; Test to check masking correct, using the "llvm.loop.vectorize.predicate.enable" attribute define float @fadd_predicated(float* noalias nocapture %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_predicated -; CHECK-ORDERED: vector.ph -; CHECK-ORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1 -; CHECK-ORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i64 0 -; CHECK-ORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue2 ] -; CHECK-ORDERED: pred.load.continue2 -; CHECK-ORDERED: %[[PHI:.*]] = phi <2 x float> [ %[[PHI0:.*]], %pred.load.continue ], [ %[[INS_ELT:.*]], %pred.load.if1 ] -; CHECK-ORDERED: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]]) -; CHECK-ORDERED: for.end: -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: ret 
float %[[RES_PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_predicated -; CHECK-UNORDERED: vector.ph -; CHECK-UNORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1 -; CHECK-UNORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i64 0 -; CHECK-UNORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <2 x float> [ , %vector.ph ], [ %[[FADD:.*]], %pred.load.continue2 ] -; CHECK-UNORDERED: %[[ICMP:.*]] = icmp ule <2 x i64> %vec.ind, %[[SPLAT]] -; CHECK-UNORDERED: pred.load.continue2 -; CHECK-UNORDERED: %[[FADD]] = fadd <2 x float> %[[RDX_PHI]], {{.*}} -; CHECK-UNORDERED: %[[MASK:.*]] = select <2 x i1> %[[ICMP]], <2 x float> %[[FADD]], <2 x float> %[[RDX_PHI]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %[[MASK]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[LOAD]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[SUM]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_predicated -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_predicated +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[L2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[L3:%.*]] = load float, float* [[L2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[L7]] = fadd float [[SUM_02]], [[L3]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[SUM_0_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_predicated +; CHECK-UNORDERED-SAME: (float* noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-UNORDERED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-UNORDERED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; 
CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ , [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-UNORDERED-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-UNORDERED: pred.load.if: +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP3]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-UNORDERED: pred.load.continue: +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-UNORDERED-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK-UNORDERED: pred.load.if1: +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP10]], i32 1 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-UNORDERED: pred.load.continue2: +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = phi <2 x float> [ [[TMP6]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF1]] ] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd <2 x float> [[VEC_PHI]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP0]], <2 x float> [[TMP13]], <2 x float> [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[TMP14]]) +; CHECK-UNORDERED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[L2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[L3:%.*]] = load float, float* [[L2]], align 4 +; CHECK-UNORDERED-NEXT: [[L7]] = fadd 
float [[SUM_02]], [[L3]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[SUM_0_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_predicated +; CHECK-ORDERED-SAME: (float* noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-ORDERED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-ORDERED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-ORDERED-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-ORDERED: pred.load.if: +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP3]], align 4 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-ORDERED: pred.load.continue: +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-ORDERED-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK-ORDERED: pred.load.if1: +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP10]], i32 1 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-ORDERED: pred.load.continue2: +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = phi <2 x float> [ [[TMP6]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF1]] ] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP0]], <2 x float> [[TMP12]], <2 x float> +; CHECK-ORDERED-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI]], <2 x float> [[TMP13]]) +; 
CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[L2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[L3:%.*]] = load float, float* [[L2]], align 4 +; CHECK-ORDERED-NEXT: [[L7]] = fadd float [[SUM_02]], [[L3]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[SUM_0_LCSSA]] +; + + entry: br label %for.body @@ -539,30 +1387,98 @@ ; Negative test - loop contains multiple fadds which we cannot safely reorder define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_multiple -; CHECK-ORDERED-NOT: vector.body - -; CHECK-UNORDERED-LABEL: @fadd_multiple -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float> -; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]] -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float> -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]] -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RET]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_multiple +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; 
CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_multiple +; CHECK-UNORDERED-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd <8 x float> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP8]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label 
[[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP11]] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-LABEL: define float @fadd_multiple +; CHECK-ORDERED-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[RDX]] +; + + entry: br label %for.body @@ -588,30 +1504,98 @@ ; Negative test - loop contains two fadds and only one fadd has the fast flag, ; which we cannot safely reorder. 
define float @fadd_multiple_one_flag(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_multiple_one_flag -; CHECK-ORDERED-NOT: vector.body - -; CHECK-UNORDERED-LABEL: @fadd_multiple_one_flag -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float> -; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]] -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float> -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd fast <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]] -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD2]] = fadd fast float %[[FADD1]], %[[LOAD2]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RET]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_one_flag -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_multiple_one_flag +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd fast float [[ADD]], [[TMP1]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_multiple_one_flag +; CHECK-UNORDERED-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; 
CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd fast <8 x float> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP8]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP11]] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd fast float [[ADD]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-LABEL: define float @fadd_multiple_one_flag +; CHECK-ORDERED-SAME: (float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; 
CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd fast float [[ADD]], [[TMP1]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[RDX]] +; + + entry: br label %for.body @@ -653,14 +1637,71 @@ ; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even ; with the -hints-allow-reordering flag set to true. define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) { -; CHECK-ORDERED-LABEL: @induction_and_reduction -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @induction_and_reduction +; CHECK-NOT-VECTORIZED-SAME: (float* nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], float* noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[X_014:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[X_014]], 2.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD3_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @induction_and_reduction +; CHECK-UNORDERED-SAME: (float* nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], float* noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label 
[[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[X_014:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[X_014]], 2.000000e+00 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD3_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @induction_and_reduction +; CHECK-ORDERED-SAME: (float* nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], float* noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[X_014:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[X_014]], 2.000000e+00 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD3_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @induction_and_reduction -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @induction_and_reduction -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -685,50 +1726,146 @@ ; As above, but with the FP induction being unordered (fast) the loop can be vectorized with strict reductions define float @fast_induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) { -; CHECK-ORDERED-LABEL: @fast_induction_and_reduction -; CHECK-ORDERED: vector.ph -; CHECK-ORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ] -; CHECK-ORDERED: 
%[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>* -; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]]) -; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD2:.*]], %for.body ] -; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ] -; CHECK-ORDERED: store float %[[IND_SUM_PHI]], float* -; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00 -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-ORDERED: %[[FADD2]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[FADD1]], %middle.block ] -; CHECK-ORDERED: ret float %[[RES_PHI]] - -; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction -; CHECK-UNORDERED: vector.ph -; CHECK-UNORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <4 x float> [ , %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ] -; CHECK-UNORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>* -; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[RDX_PHI]], %[[LOAD1]] -; CHECK-UNORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: %[[VEC_RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]]) -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD:.*]], %for.body ] -; CHECK-UNORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ] -; CHECK-UNORDERED: store float %[[IND_SUM_PHI]], float* -; CHECK-UNORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00 -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[VEC_RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES_PHI]] - -; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fast_induction_and_reduction +; CHECK-NOT-VECTORIZED-SAME: (float* nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], float* noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[X_014:%.*]] = phi fast float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd 
fast float [[X_014]], 2.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD3_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fast_induction_and_reduction +; CHECK-UNORDERED-SAME: (float* nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], float* noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = fmul fast float 2.000000e+00, [[DOTCAST]] +; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT]], [[TMP0]] +; CHECK-UNORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 +; CHECK-UNORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-UNORDERED-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* +; CHECK-UNORDERED-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP4]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; 
CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP8]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[INIT]], [[ENTRY]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_015:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[X_014:%.*]] = phi fast float [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd fast float [[X_014]], 2.000000e+00 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP11]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD3_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fast_induction_and_reduction +; CHECK-ORDERED-SAME: (float* nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], float* noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = fmul fast float 2.000000e+00, [[DOTCAST]] +; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 +; CHECK-ORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-ORDERED-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: 
[[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* +; CHECK-ORDERED-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP4]], align 4 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP8]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[INIT]], [[ENTRY]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_015:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[X_014:%.*]] = phi fast float [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd fast float [[X_014]], 2.000000e+00 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD3_LCSSA]] +; + + entry: br label %for.body @@ -755,15 +1892,83 @@ ; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even ; with the -hints-allow-reordering flag set to true. 
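;
; Summary of the regenerated checks below (descriptive only, no extra
; requirement is added): the loop carries a plain fadd reduction plus an
; fmul reduction without fast-math flags, and none of the three run lines
; produces a vector.body - each keeps the scalar for.body with both
; reduction phis and the fast fadd induction.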
define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) { +; CHECK-NOT-VECTORIZED-LABEL: define float @fast_induction_unordered_reduction +; CHECK-NOT-VECTORIZED-SAME: (float* nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[X_021:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: store float [[X_021]], float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd fast float [[X_021]], 2.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[SUM_022]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[MUL]] = fmul float [[SUM2_023]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD6]] +; +; CHECK-UNORDERED-LABEL: define float @fast_induction_unordered_reduction +; CHECK-UNORDERED-SAME: (float* nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[X_021:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: store float [[X_021]], float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd fast float [[X_021]], 2.000000e+00 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[SUM_022]], [[TMP0]] +; CHECK-UNORDERED-NEXT: [[MUL]] 
= fmul float [[SUM2_023]], [[TMP0]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]] +; CHECK-UNORDERED-NEXT: ret float [[ADD6]] +; +; CHECK-ORDERED-LABEL: define float @fast_induction_unordered_reduction +; CHECK-ORDERED-SAME: (float* nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], float* noalias nocapture [[A:%.*]], float* noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[X_021:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: store float [[X_021]], float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd fast float [[X_021]], 2.000000e+00 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[SUM_022]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[MUL]] = fmul float [[SUM2_023]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]] +; CHECK-ORDERED-NEXT: ret float [[ADD6]] +; -; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction -; CHECK-ORDERED-NOT: vector.body - -; CHECK-UNORDERED-LABEL: @fast_induction_unordered_reduction -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_unordered_reduction -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -791,59 +1996,133 @@ ; Test reductions for a VF of 1 and a UF > 1. 
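;
; A short gloss on "VF of 1 and a UF > 1" (terminology only): the loop is not
; widened (vectorization factor 1) but is interleaved four ways. As the checks
; below show, the UNORDERED run keeps four independent scalar accumulators and
; combines them pairwise in middle.block, roughly
;
;   %bin.rdx  = fadd float %t13, %t12
;   %bin.rdx4 = fadd float %t14, %bin.rdx
;   %bin.rdx5 = fadd float %t15, %bin.rdx4
;
; (names illustrative), whereas the ORDERED run threads all four loads through
; a single accumulator so the original evaluation order is preserved.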
define float @fadd_scalar_vf(float* noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_scalar_vf -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, {{.*}} ], [ %[[FADD4:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float* -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float* -; CHECK-ORDERED: %[[LOAD4:.*]] = load float, float* -; CHECK-ORDERED: %[[FADD1:.*]] = fadd float %[[VEC_PHI]], %[[LOAD1]] -; CHECK-ORDERED: %[[FADD2:.*]] = fadd float %[[FADD1]], %[[LOAD2]] -; CHECK-ORDERED: %[[FADD3:.*]] = fadd float %[[FADD2]], %[[LOAD3]] -; CHECK-ORDERED: %[[FADD4]] = fadd float %[[FADD3]], %[[LOAD4]] -; CHECK-ORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-ORDERED: scalar.ph -; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[FADD4]], %middle.block ] -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ] -; CHECK-ORDERED: %[[LOAD5:.*]] = load float, float* -; CHECK-ORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[FADD4]], %middle.block ] -; CHECK-ORDERED: ret float %[[RES_PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_scalar_vf -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD3:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD4:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* -; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float* -; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD1]] = fadd float %[[LOAD1]], %[[VEC_PHI1]] -; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[LOAD2]], %[[VEC_PHI2]] -; CHECK-UNORDERED: %[[FADD3]] = fadd float %[[LOAD3]], %[[VEC_PHI3]] -; CHECK-UNORDERED: %[[FADD4]] = fadd float %[[LOAD4]], %[[VEC_PHI4]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd float %[[FADD2]], %[[FADD1]] -; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd float %[[FADD3]], %[[BIN_RDX1]] -; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd float %[[FADD4]], %[[BIN_RDX2]] -; CHECK-UNORDERED: scalar.ph -; CHECK-UNORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[BIN_RDX3]], %middle.block ] -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ] -; CHECK-UNORDERED: %[[LOAD5:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[BIN_RDX3]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES_PHI]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf -; CHECK-NOT-VECTORIZED-NOT: @vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_scalar_vf +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; 
CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_scalar_vf +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = load float, float* [[TMP4]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = load float, float* [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd float [[TMP8]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd float [[TMP9]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd float [[TMP10]], [[VEC_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd float [[TMP11]], [[VEC_PHI3]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: 
[[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd float [[TMP13]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX4:%.*]] = fadd float [[TMP14]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX5:%.*]] = fadd float [[TMP15]], [[BIN_RDX4]] +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP17]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_scalar_vf +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = load float, float* [[TMP4]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = load float, float* [[TMP5]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP6]], align 4 +; 
CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = fadd float [[VEC_PHI]], [[TMP8]] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = fadd float [[TMP12]], [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = fadd float [[TMP13]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP15]] = fadd float [[TMP14]], [[TMP11]] +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP17]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + + entry: br label %for.body @@ -864,59 +2143,134 @@ ; Same as above but where fadd has a fast-math flag. 
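;
; The intended difference from @fadd_scalar_vf is only the nnan flag on the
; fadd; per the checks below that flag is kept on every interleaved scalar
; fadd and on the pairwise middle.block fadds of the UNORDERED run, as well
; as on the serialized chain of the ORDERED run.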
define float @fadd_scalar_vf_fmf(float* noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_scalar_vf_fmf -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ] -; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD2:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD3:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD4:%.*]] = load float, float* -; CHECK-ORDERED: [[FADD1:%.*]] = fadd nnan float [[VEC_PHI]], [[LOAD1]] -; CHECK-ORDERED: [[FADD2:%.*]] = fadd nnan float [[FADD1]], [[LOAD2]] -; CHECK-ORDERED: [[FADD3:%.*]] = fadd nnan float [[FADD2]], [[LOAD3]] -; CHECK-ORDERED: [[FADD4]] = fadd nnan float [[FADD3]], [[LOAD4]] -; CHECK-ORDERED-NOT: @llvm.vector.reduce.fadd -; CHECK-ORDERED: scalar.ph: -; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD4]], %middle.block ] -; CHECK-ORDERED: for.body: -; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ] -; CHECK-ORDERED: [[LOAD5:%.*]] = load float, float* -; CHECK-ORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]] -; CHECK-ORDERED: for.end: -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[FADD4]], %middle.block ] -; CHECK-ORDERED: ret float [[RES]] - -; CHECK-UNORDERED-LABEL: @fadd_scalar_vf_fmf -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD1:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD2:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI4:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ] -; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, float* -; CHECK-UNORDERED: [[FADD1]] = fadd nnan float [[LOAD1]], [[VEC_PHI1]] -; CHECK-UNORDERED: [[FADD2]] = fadd nnan float [[LOAD2]], [[VEC_PHI2]] -; CHECK-UNORDERED: [[FADD3]] = fadd nnan float [[LOAD3]], [[VEC_PHI3]] -; CHECK-UNORDERED: [[FADD4]] = fadd nnan float [[LOAD4]], [[VEC_PHI4]] -; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan float [[FADD2]], [[FADD1]] -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan float [[FADD3]], [[BIN_RDX1]] -; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd nnan float [[FADD4]], [[BIN_RDX2]] -; CHECK-UNORDERED: scalar.ph: -; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX3]], %middle.block ] -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float* -; CHECK-UNORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_scalar_vf_fmf +; CHECK-NOT-VECTORIZED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float 
[ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd nnan float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_scalar_vf_fmf +; CHECK-UNORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = load float, float* [[TMP4]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = load float, float* [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd nnan float [[TMP8]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd nnan float [[TMP9]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd nnan float [[TMP10]], [[VEC_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd nnan float [[TMP11]], [[VEC_PHI3]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: 
[[BIN_RDX:%.*]] = fadd nnan float [[TMP13]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX4:%.*]] = fadd nnan float [[TMP14]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX5:%.*]] = fadd nnan float [[TMP15]], [[BIN_RDX4]] +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP17]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_scalar_vf_fmf +; CHECK-ORDERED-SAME: (float* noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = load float, float* [[TMP4]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = load float, float* [[TMP5]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = fadd nnan float [[VEC_PHI]], [[TMP8]] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = fadd nnan float [[TMP12]], [[TMP9]] 
+; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = fadd nnan float [[TMP13]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP15]] = fadd nnan float [[TMP14]], [[TMP11]] +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP17]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + ; CHECK-UORDERED: for.end -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[BIN_RDX3]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] -; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf_fmf -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -937,30 +2291,100 @@ ; Test case where the reduction step is a first-order recurrence. 
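; (Illustrative only, not part of the test input.) A rough C equivalent of the
; pattern exercised below, where the value added into the reduction comes from
; the previous iteration; the variable names are placeholders:
;
;   double red = 0.0, rec = 0.0;
;   for (int iv = 0; ; ) {
;     red += rec;          /* reduction consumes last iteration's value */
;     rec = (double)iv;    /* first-order recurrence */
;     if (++iv == 0)       /* same wrap-around exit condition as the IR */
;       break;
;   }
;   return red;
;
; Both vectorized runs below keep the fadd chain correct: the ordered run feeds
; the shuffled recurrence vector into llvm.vector.reduce.fadd on each iteration,
; while the unordered run accumulates a <4 x double> phi and reduces once in
; middle.block.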
define double @reduction_increment_by_first_order_recurrence() { -; CHECK-ORDERED-LABEL: @reduction_increment_by_first_order_recurrence( -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[RED:%.*]] = phi double [ 0.000000e+00, %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ] -; CHECK-ORDERED: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ] -; CHECK-ORDERED: [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double> -; CHECK-ORDERED: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> -; CHECK-ORDERED: [[RED_NEXT]] = call double @llvm.vector.reduce.fadd.v4f64(double [[RED]], <4 x double> [[TMP1]]) -; CHECK-ORDERED: scalar.ph: -; CHECK-ORDERED: = phi double [ 0.000000e+00, %entry ], [ [[RED_NEXT]], %middle.block ] -; -; CHECK-UNORDERED-LABEL: @reduction_increment_by_first_order_recurrence( -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[RED:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ] -; CHECK-UNORDERED: [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double> -; CHECK-UNORDERED: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> -; CHECK-UNORDERED: [[RED_NEXT]] = fadd <4 x double> [[TMP1]], [[RED]] -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[RDX:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[RED_NEXT]]) -; CHECK-UNORDERED: scalar.ph: -; CHECK-UNORDERED: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, %entry ], [ [[RDX]], %middle.block ] -; -; CHECK-NOT-VECTORIZED-LABEL: @reduction_increment_by_first_order_recurrence( -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define double @reduction_increment_by_first_order_recurrence() { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[LOOP:%.*]] +; CHECK-NOT-VECTORIZED: loop: +; CHECK-NOT-VECTORIZED-NEXT: [[RED:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[FOR:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RED_NEXT]] = fadd double [[FOR]], [[RED]] +; CHECK-NOT-VECTORIZED-NEXT: [[FOR_NEXT]] = sitofp i32 [[IV]] to double +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NOT-VECTORIZED: exit: +; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret double [[RES]] +; +; CHECK-UNORDERED-LABEL: define double @reduction_increment_by_first_order_recurrence() { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , 
[[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double> +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP0]], <4 x i32> +; CHECK-UNORDERED-NEXT: [[TMP2]] = fadd <4 x double> [[TMP1]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-UNORDERED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 +; CHECK-UNORDERED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP0]], i32 3 +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[LOOP:%.*]] +; CHECK-UNORDERED: loop: +; CHECK-UNORDERED-NEXT: [[RED:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-UNORDERED-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-UNORDERED-NEXT: [[RED_NEXT]] = fadd double [[SCALAR_RECUR]], [[RED]] +; CHECK-UNORDERED-NEXT: [[FOR_NEXT]] = sitofp i32 [[IV]] to double +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-UNORDERED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-UNORDERED: exit: +; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret double [[RES]] +; +; CHECK-ORDERED-LABEL: define double @reduction_increment_by_first_order_recurrence() { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double> +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP0]], <4 x i32> +; 
CHECK-ORDERED-NEXT: [[TMP2]] = call double @llvm.vector.reduce.fadd.v4f64(double [[VEC_PHI]], <4 x double> [[TMP1]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-ORDERED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 +; CHECK-ORDERED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP0]], i32 3 +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[LOOP:%.*]] +; CHECK-ORDERED: loop: +; CHECK-ORDERED-NEXT: [[RED:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-ORDERED-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-ORDERED-NEXT: [[RED_NEXT]] = fadd double [[SCALAR_RECUR]], [[RED]] +; CHECK-ORDERED-NEXT: [[FOR_NEXT]] = sitofp i32 [[IV]] to double +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-ORDERED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-ORDERED: exit: +; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret double [[RES]] ; entry: br label %loop @@ -983,14 +2407,76 @@ ; We should not mark the fadd as an ordered reduction here as there are ; more than 2 uses of the instruction define float @fadd_multiple_use(i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_multiple_use +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_multiple_use +; CHECK-NOT-VECTORIZED-SAME: (i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[RED]], 1.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nsw i64 [[PHI1]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]] +; CHECK-NOT-VECTORIZED: bb1: +; CHECK-NOT-VECTORIZED-NEXT: [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[PHI2]] +; CHECK-NOT-VECTORIZED: bb2: +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: 
[[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[PHI3]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_multiple_use +; CHECK-UNORDERED-SAME: (i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ] +; CHECK-UNORDERED-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ] +; CHECK-UNORDERED-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ] +; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[RED]], 1.000000e+00 +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nsw i64 [[PHI1]], 1 +; CHECK-UNORDERED-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]] +; CHECK-UNORDERED: bb1: +; CHECK-UNORDERED-NEXT: [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[PHI2]] +; CHECK-UNORDERED: bb2: +; CHECK-UNORDERED-NEXT: [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ] +; CHECK-UNORDERED-NEXT: ret float [[PHI3]] +; +; CHECK-ORDERED-LABEL: define float @fadd_multiple_use +; CHECK-ORDERED-SAME: (i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ] +; CHECK-ORDERED-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ] +; CHECK-ORDERED-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ] +; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[RED]], 1.000000e+00 +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nsw i64 [[PHI1]], 1 +; CHECK-ORDERED-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]] +; CHECK-ORDERED: bb1: +; CHECK-ORDERED-NEXT: [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[PHI2]] +; CHECK-ORDERED: bb2: +; CHECK-ORDERED-NEXT: [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ] +; CHECK-ORDERED-NEXT: ret float [[PHI3]] +; ; CHECK-ORDERED-LABEL-NOT: vector.body -; CHECK-UNORDERED-LABEL: @fadd_multiple_use ; CHECK-UNORDERED-LABEL-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_use -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1019,59 +2505,192 @@ ; Test case where the loop has a call to the llvm.fmuladd intrinsic. 
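; (Illustrative sketch only, not the test input.) Roughly equivalent C, with
; fmaf standing in for the llvm.fmuladd intrinsic and names chosen freely:
;
;   float sum = 0.0f;
;   for (int64_t i = 0; i < n; ++i)
;     sum = fmaf(a[i], b[i], sum);   /* multiply-add into the running sum */
;   return sum;
;
; In the checks below, the ordered run preserves the addition order by
; splitting each fmuladd into an fmul followed by llvm.vector.reduce.fadd on a
; scalar accumulator, whereas the unordered run keeps wide llvm.fmuladd.v8f32
; calls and performs a single reduction in middle.block.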
define float @fmuladd_strict(float* %a, float* %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_strict -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ] -; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: [[FMUL:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[FMUL]]) -; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX]], <8 x float> [[FMUL1]]) -; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX1]], <8 x float> [[FMUL2]]) -; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX2]], <8 x float> [[FMUL3]]) -; CHECK-ORDERED: for.body: -; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-ORDERED: [[LOAD:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float* -; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX3]], %middle.block ] - -; CHECK-UNORDERED-LABEL: @fmuladd_strict -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ , %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] -; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[FMULADD]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]]) -; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]]) -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float* -; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]]) -; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ 
[[MULADD]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_strict +; CHECK-NOT-VECTORIZED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_strict +; CHECK-UNORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; 
CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16 +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP21]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8 +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16 +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 24 +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP28]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP29]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP30]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP31]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP29]], [[TMP28]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd <8 x float> 
[[TMP30]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd <8 x float> [[TMP31]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX12]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP34]], float [[TMP35]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_strict +; CHECK-ORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; CHECK-ORDERED-NEXT: 
[[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16 +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24 +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP21]], align 4 +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8 +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4 +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16 +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4 +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 24 +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4 +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[TMP28]]) +; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP32]], <8 x float> [[TMP29]]) +; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP33]], <8 x float> [[TMP30]]) +; CHECK-ORDERED-NEXT: [[TMP35]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP34]], <8 x float> [[TMP31]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 
[[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP37]], float [[TMP38]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; + + entry: br label %for.body @@ -1094,73 +2713,159 @@ ; Test reductions for a VF of 1 and a UF > 1 where the loop has a call to the llvm.fmuladd intrinsic. define float @fmuladd_scalar_vf(float* %a, float* %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ] -; CHECK-ORDERED: [[LOAD:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD2:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD3:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD4:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD5:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD6:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD7:%.*]] = load float, float* -; CHECK-ORDERED: [[FMUL:%.*]] = fmul float [[LOAD]], [[LOAD4]] -; CHECK-ORDERED: [[FMUL1:%.*]] = fmul float [[LOAD1]], [[LOAD5]] -; CHECK-ORDERED: [[FMUL2:%.*]] = fmul float [[LOAD2]], [[LOAD6]] -; CHECK-ORDERED: [[FMUL3:%.*]] = fmul float [[LOAD3]], [[LOAD7]] -; CHECK-ORDERED: [[FADD:%.*]] = fadd float [[VEC_PHI]], [[FMUL]] -; CHECK-ORDERED: [[FADD1:%.*]] = fadd float [[FADD]], [[FMUL1]] -; CHECK-ORDERED: [[FADD2:%.*]] = fadd float [[FADD1]], [[FMUL2]] -; CHECK-ORDERED: [[FADD3]] = fadd float [[FADD2]], [[FMUL3]] -; CHECK-ORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-ORDERED: scalar.ph -; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD3]], %middle.block ] -; CHECK-ORDERED: for.body -; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-ORDERED: [[LOAD8:%.*]] = load float, float* -; CHECK-ORDERED: [[LOAD9:%.*]] = load float, float* -; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[FADD3]], 
%middle.block ] -; CHECK-ORDERED: ret float [[RES]] - -; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ] -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD6:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD7:%.*]] = load float, float* -; CHECK-UNORDERED: [[FMULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]]) -; CHECK-UNORDERED: [[FMULADD1]] = tail call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]]) -; CHECK-UNORDERED: [[FMULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]]) -; CHECK-UNORDERED: [[FMULADD3]] = tail call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]]) -; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]] -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[BIN_RDX]] -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[FMULADD3]], [[BIN_RDX1]] -; CHECK-UNORDERED: scalar.ph: -; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX2]], %middle.block ] -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD8:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD9:%.*]] = load float, float* -; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]]) -; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[BIN_RDX2]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_scalar_vf +; CHECK-NOT-VECTORIZED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float 
@llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_scalar_vf +; CHECK-UNORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = load float, float* [[TMP4]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = load float, float* [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = load float, float* [[TMP12]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[TMP13]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[TMP14]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[TMP15]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP20]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP16]], float [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP21]] = tail call float 
@llvm.fmuladd.f32(float [[TMP9]], float [[TMP17]], float [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP18]], float [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP23]] = tail call float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP19]], float [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd float [[TMP21]], [[TMP20]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX4:%.*]] = fadd float [[TMP22]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX5:%.*]] = fadd float [[TMP23]], [[BIN_RDX4]] +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP25]], float [[TMP26]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_scalar_vf +; CHECK-ORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: 
[[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = load float, float* [[TMP4]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = load float, float* [[TMP5]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = load float, float* [[TMP12]], align 4 +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[TMP13]], align 4 +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[TMP14]], align 4 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[TMP15]], align 4 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = fmul float [[TMP8]], [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = fmul float [[TMP9]], [[TMP17]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = fmul float [[TMP10]], [[TMP18]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = fmul float [[TMP11]], [[TMP19]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = fadd float [[VEC_PHI]], [[TMP20]] +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = fadd float [[TMP24]], [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = fadd float [[TMP25]], [[TMP22]] +; CHECK-ORDERED-NEXT: [[TMP27]] = fadd float [[TMP26]], [[TMP23]] +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP29]], float [[TMP30]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; + + entry: br label %for.body @@ -1183,14 +2888,65 @@ ; Test case where the reduction phi is one of the mul operands of the fmuladd. define float @fmuladd_phi_is_mul_operand(float* %a, float* %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_phi_is_mul_operand -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_phi_is_mul_operand +; CHECK-NOT-VECTORIZED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_phi_is_mul_operand +; CHECK-UNORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; 
CHECK-ORDERED-LABEL: define float @fmuladd_phi_is_mul_operand +; CHECK-ORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_mul_operand -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_mul_operand -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1213,14 +2969,59 @@ ; Test case where the reduction phi is two operands of the fmuladd. define float @fmuladd_phi_is_two_operands(float* %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_phi_is_two_operands -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_phi_is_two_operands +; CHECK-NOT-VECTORIZED-SAME: (float* [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_phi_is_two_operands +; CHECK-UNORDERED-SAME: (float* [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; 
CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_phi_is_two_operands +; CHECK-ORDERED-SAME: (float* [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_two_operands -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_two_operands -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1242,37 +3043,137 @@ ; Test case with multiple calls to llvm.fmuladd, which is not safe to reorder ; so is only vectorized in the unordered (fast) case. 
define float @fmuladd_multiple(float* %a, float* %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_multiple -; CHECK-ORDERED-NOT: vector.body: - -; CHECK-UNORDERED-LABEL: @fmuladd_multiple -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ , %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ] -; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: [[FMULADD:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]]) -; CHECK-UNORDERED: [[FMULADD2]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[FMULADD]]) -; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]]) -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD2:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* -; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float* -; CHECK-UNORDERED: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]]) -; CHECK-UNORDERED: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[MULADD]]) -; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD2]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_multiple -; CHECK-NOT-VECTORIZED-NOT: vector.body: +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_multiple +; CHECK-NOT-VECTORIZED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[MULADD]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: 
+; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_multiple +; CHECK-UNORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16 +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0 +; CHECK-UNORDERED-NEXT: 
[[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP21]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8 +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16 +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 24 +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[TMP32]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[TMP28]]) +; CHECK-UNORDERED-NEXT: [[TMP33]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[TMP29]]) +; CHECK-UNORDERED-NEXT: [[TMP34]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[TMP30]]) +; CHECK-UNORDERED-NEXT: [[TMP35]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[TMP31]]) +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP33]], [[TMP32]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd <8 x float> [[TMP34]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd <8 x float> [[TMP35]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX12]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: 
[[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP38]], float [[TMP39]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP38]], float [[TMP39]], float [[MULADD]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_multiple +; CHECK-ORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[MULADD]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; + + entry: br label %for.body @@ -1296,14 +3197,68 @@ ; Same as above but the first fmuladd is one of the mul operands of the second fmuladd. 
define float @multiple_fmuladds_mul_operand(float* %a, float* %b, i64 %n) { -; CHECK-ORDERED-LABEL: @multiple_fmuladds_mul_operand -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @multiple_fmuladds_mul_operand +; CHECK-NOT-VECTORIZED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @multiple_fmuladds_mul_operand +; CHECK-UNORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @multiple_fmuladds_mul_operand +; CHECK-ORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: 
+; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @multiple_fmuladds_mul_operand -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_mul_operand -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1327,14 +3282,68 @@ ; Same as above but the first fmuladd is two of the operands of the second fmuladd. define float @multiple_fmuladds_two_operands(float* %a, float* %b, i64 %n) { -; CHECK-ORDERED-LABEL: @multiple_fmuladds_two_operands -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @multiple_fmuladds_two_operands +; CHECK-NOT-VECTORIZED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @multiple_fmuladds_two_operands +; CHECK-UNORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: 
br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @multiple_fmuladds_two_operands +; CHECK-ORDERED-SAME: (float* [[A:%.*]], float* [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @multiple_fmuladds_two_operands -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_two_operands -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1360,38 +3369,132 @@ ; Test case with invariant store where fadd is strict. 
define void @reduction_store_to_invariant_address(float* %dst, float* readonly %src) { -; CHECK-ORDERED-LABEL: @reduction_store_to_invariant_address( -; CHECK-ORDERED: entry -; CHECK-ORDERED: %[[DEST_PTR:.*]] = getelementptr inbounds float, float* %dst, i64 42 -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, <8 x float>* -; CHECK-ORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD_VEC]]) -; CHECK-ORDERED: middle.block -; CHECK-ORDERED: store float %[[RDX]], float* %[[DEST_PTR]] -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[LOAD:.*]] = load float, float* -; CHECK-ORDERED: %[[FADD:.*]] = fadd float %{{.*}}, %[[LOAD]] -; CHECK-ORDERED: store float %[[FADD]], float* %[[DEST_PTR]] - -; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address( -; CHECK-UNORDERED: entry -; CHECK-UNORDERED: %[[DEST_PTR:.*]] = getelementptr inbounds float, float* %dst, i64 42 -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, <8 x float>* -; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <8 x float> %[[VEC_PHI]], %[[LOAD_VEC]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[FADD_VEC]]) -; CHECK-UNORDERED: store float %[[RDX]], float* %[[DEST_PTR]] -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float* -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float {{.*}}, %[[LOAD]] -; CHECK-UNORDERED: store float %[[FADD]], float* %[[DEST_PTR]] - -; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address( -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define void @reduction_store_to_invariant_address +; CHECK-NOT-VECTORIZED-SAME: (float* [[DST:%.*]], float* readonly [[SRC:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[DST]], i64 42 +; CHECK-NOT-VECTORIZED-NEXT: store float 0.000000e+00, float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[TMP1]] +; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; CHECK-NOT-VECTORIZED: for.cond.cleanup: +; CHECK-NOT-VECTORIZED-NEXT: ret void +; +; CHECK-UNORDERED-LABEL: define void @reduction_store_to_invariant_address +; CHECK-UNORDERED-SAME: (float* [[DST:%.*]], float* readonly 
[[SRC:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[SRC4:%.*]] = bitcast float* [[SRC]] to i8* +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[DST]], i64 42 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = bitcast float* [[ARRAYIDX]] to i8* +; CHECK-UNORDERED-NEXT: store float 0.000000e+00, float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-UNORDERED: vector.memcheck: +; CHECK-UNORDERED-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DST]], i64 43 +; CHECK-UNORDERED-NEXT: [[SCEVGEP3:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-UNORDERED-NEXT: [[SCEVGEP5:%.*]] = getelementptr float, float* [[SRC]], i64 1000 +; CHECK-UNORDERED-NEXT: [[SCEVGEP56:%.*]] = bitcast float* [[SCEVGEP5]] to i8* +; CHECK-UNORDERED-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[ARRAYIDX2]], [[SCEVGEP56]] +; CHECK-UNORDERED-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SRC4]], [[SCEVGEP3]] +; CHECK-UNORDERED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-UNORDERED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !alias.scope !40 +; CHECK-UNORDERED-NEXT: [[TMP4]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-UNORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP4]]) +; CHECK-UNORDERED-NEXT: store float [[TMP6]], float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[VECTOR_MEMCHECK]] ], [ 0.000000e+00, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[INDVARS_IV]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; 
CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP7]], [[TMP8]] +; CHECK-UNORDERED-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000 +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; CHECK-UNORDERED: for.cond.cleanup: +; CHECK-UNORDERED-NEXT: ret void +; +; CHECK-ORDERED-LABEL: define void @reduction_store_to_invariant_address +; CHECK-ORDERED-SAME: (float* [[DST:%.*]], float* readonly [[SRC:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[SRC4:%.*]] = bitcast float* [[SRC]] to i8* +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[DST]], i64 42 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = bitcast float* [[ARRAYIDX]] to i8* +; CHECK-ORDERED-NEXT: store float 0.000000e+00, float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-ORDERED: vector.memcheck: +; CHECK-ORDERED-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DST]], i64 43 +; CHECK-ORDERED-NEXT: [[SCEVGEP3:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-ORDERED-NEXT: [[SCEVGEP5:%.*]] = getelementptr float, float* [[SRC]], i64 1000 +; CHECK-ORDERED-NEXT: [[SCEVGEP56:%.*]] = bitcast float* [[SCEVGEP5]] to i8* +; CHECK-ORDERED-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[ARRAYIDX2]], [[SCEVGEP56]] +; CHECK-ORDERED-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SRC4]], [[SCEVGEP3]] +; CHECK-ORDERED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-ORDERED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !alias.scope !36 +; CHECK-ORDERED-NEXT: [[TMP4]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-ORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: store float [[TMP4]], float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[VECTOR_MEMCHECK]] ], [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label 
[[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[INDVARS_IV]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP6]], [[TMP7]] +; CHECK-ORDERED-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000 +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; CHECK-ORDERED: for.cond.cleanup: +; CHECK-ORDERED-NEXT: ret void +; + + entry: %arrayidx = getelementptr inbounds float, float* %dst, i64 42 Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -31,37 +31,31 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 
0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = mul nsw [[WIDE_MASKED_GATHER1]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP9]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP10]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[TMP3]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw [[TMP4]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 -1 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP6]], [[TMP7]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -159,18 +153,19 @@ ; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( [[TMP8]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_MASKED_GATHER]] to ; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP10]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = sext [[WIDE_MASKED_GATHER1]] to -; CHECK-NEXT: [[TMP13:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, [[TMP7]] -; 
CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_MASKED_GATHER1]] to +; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 -1 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP10]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -180,17 +175,17 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, i64 [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 ; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV]], [[C]] ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: store i32 [[ADD3]], ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP21]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV6]], [[D]] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP19]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP20]] ; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 @@ -259,31 +254,33 @@ ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, 
zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = trunc [[TMP9]] to -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP10]], [[TMP11]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[WIDE_MASKED_GATHER1]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP13:%.*]] = trunc [[TMP12]] to -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP13]], [[TMP14]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[TMP7]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc [[TMP10]] to +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP11]], [[TMP12]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw [[TMP8]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc [[TMP13]] to +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[TMP9]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP14]], [[TMP15]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -293,17 +290,17 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP18]], [[C]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[C]] ; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[ADD3]] to i16 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: store i16 [[CONV]], ptr [[ARRAYIDX5]], align 2 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP20]], [[D]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP21]], [[D]] ; CHECK-NEXT: [[CONV6:%.*]] = trunc i32 [[MUL]] to i16 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, i64 [[TMP19]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, i64 [[TMP20]] ; CHECK-NEXT: store i16 [[CONV6]], ptr [[ARRAYIDX9]], align 2 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 @@ -485,40 +482,47 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = add nsw i64 [[N_MOD_VF]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer), [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP3]], -4 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[DOTNEG]], 
i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i32() -; CHECK-NEXT: [[INDUCTION1:%.*]] = sub shufflevector ( insertelement ( poison, i32 1023, i64 0), poison, zeroinitializer), [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[DOTNEG7:%.*]] = mul nsw i32 [[TMP5]], -4 -; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement poison, i32 [[DOTNEG7]], i64 0 -; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector [[DOTSPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i32 1023, i64 0), poison, zeroinitializer), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i32 [[TMP3]], -4 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[DOTNEG]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND4:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[VEC_IND4]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[A]], [[VEC_IND]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = sub nsw [[WIDE_MASKED_GATHER6]], [[VEC_IND4]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP7]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B]], [[VEC_IND]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP9]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[VEC_IND_NEXT5]] = add [[VEC_IND4]], [[DOTSPLAT3]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 
[[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP10]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]] +; CHECK-NEXT: [[REVERSE2:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP12]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[REVERSE2]], [[REVERSE3]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP19]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = shl nuw nsw i64 [[TMP20]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -591,28 +595,23 @@ ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP3]] ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = and i64 [[INDEX]], 9223372036854775804 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call 
@llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = shl nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: store [[TMP9]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP4]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = shl nsw [[TMP5]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP7]] +; CHECK-NEXT: store [[TMP6]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -681,28 +680,23 @@ ; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP8]] ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP10:%.*]] = shl [[TMP9]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP10]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = and i64 [[INDEX]], 9223372036854775804 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP14:%.*]] = shl nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: store [[TMP14]], ptr [[TMP15]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = shl nsw [[TMP10]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP12]] +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -849,35 +843,30 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], [[VEC_IND]] -; 
CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP9]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER2]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP11]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 -1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i64 [[TMP5]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP6]], [[TMP9]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -968,49 +957,44 @@ ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 -; CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ insertelement ( zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP5]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP6]] = add [[WIDE_MASKED_GATHER]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT]], ptr [[P]], [[VEC_IND]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP7]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP8]] = fadd fast [[VEC_PHI]], [[WIDE_MASKED_GATHER2]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +; CHECK-NEXT: [[TMP6]] = add [[TMP3]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP7]] = fadd fast [[VEC_PHI]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP6]]) -; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP6]]) +; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, 
[[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX3:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr @SA, align 4 ; CHECK-NEXT: store float [[ADD3_LCSSA]], ptr @SB, align 4 ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUMB_014:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUMA_013:%.*]] = phi i32 [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUMA_013:%.*]] = phi i32 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT]], ptr [[P]], i64 [[INDVARS_IV]], i32 0 ; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[LOAD1]], [[SUMA_013]] @@ -1091,14 +1075,17 @@ ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[TMP9]], i64 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP12]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1169,39 +1156,44 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP11]] = add [[WIDE_MASKED_GATHER1]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP11]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP10]], i64 0 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP11]]) +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) ; CHECK-NEXT: br label 
[[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP17:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[P_I_X]], align 4 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[P_I_Y]], align 4 -; CHECK-NEXT: [[TMP17]] = add nsw i32 [[TMP16]], [[S]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[P_I_X]], align 4 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[P_I_Y]], align 4 +; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP19]], [[S]] ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 [[TMP17]] +; CHECK-NEXT: ret i32 [[TMP20]] ; entry: br label %for.body @@ -1261,17 +1253,19 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], shufflevector ( insertelement ( poison, i64 -1, i64 0), poison, zeroinitializer), i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1283,8 +1277,8 @@ ; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 ; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_X]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[P_I_Y]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4 +; CHECK-NEXT: store i32 [[TMP16]], ptr [[P_I_Y]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP27:![0-9]+]] @@ -1345,44 +1339,48 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[TMP9]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP13]] = add [[WIDE_MASKED_GATHER1]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void 
@llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15]] = add [[TMP14]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP15]]) ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP22:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 ; CHECK-NEXT: [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I_PLUS_1]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[P_I_X]], align 4 -; CHECK-NEXT: store i32 [[TMP18]], ptr [[P_I_PLUS_1_Y]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[P_I_Y]], align 4 -; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP19]], [[S]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_I_X]], align 4 +; CHECK-NEXT: store i32 [[TMP20]], ptr [[P_I_PLUS_1_Y]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_I_Y]], align 4 +; CHECK-NEXT: [[TMP22]] = add nsw i32 [[TMP21]], [[S]] ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 [[TMP20]] +; CHECK-NEXT: ret i32 [[TMP22]] ; entry: br label %for.body @@ -1452,18 +1450,20 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = add [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: 
[[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[OFFSET_IDX]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[A]], i64 -1 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT2]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT4]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP11]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -30,48 +30,43 @@ ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP4]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[C]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP5]], 6 -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = shl [[TMP8]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 -; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector [[DOTSPLATINSERT2]], poison, zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP11:%.*]] = add [[DOTSPLAT3]], [[TMP10]] -; CHECK-NEXT: 
[[VECTOR_GEP4:%.*]] = shl [[TMP11]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP4]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, [[TMP9]], i64 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, [[TMP12]], i64 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP15:%.*]] = add nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP16:%.*]] = add nsw [[WIDE_MASKED_GATHER5]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP19]] -; CHECK-NEXT: store [[TMP16]], ptr [[TMP20]], align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add nsw [[WIDE_MASKED_GATHER6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP22:%.*]] = add nsw [[WIDE_MASKED_GATHER7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP21]], ptr [[TMP23]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP25:%.*]] = shl nuw nsw i64 [[TMP24]], 2 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP25]] -; CHECK-NEXT: store [[TMP22]], ptr [[TMP26]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP27]], 3 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP28]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 5 +; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( 
[[WIDE_VEC]]) +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load , ptr [[NEXT_GEP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC3]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = add nsw [[TMP10]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = add nsw [[TMP11]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: store [[TMP14]], ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP18]] +; CHECK-NEXT: store [[TMP15]], ptr [[TMP19]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP12]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = add nsw [[TMP13]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: store [[TMP20]], ptr [[TMP22]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[TMP23]], 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP24]] +; CHECK-NEXT: store [[TMP21]], ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = shl nuw nsw i64 [[TMP26]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP27]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -83,13 +78,13 @@ ; CHECK-NEXT: [[PTR_014:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[PTR_014]], i64 1 -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[PTR_014]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[PTR_014]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, ptr [[PTR_014]], i64 2 -; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP30]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP29]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_013]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP31]], 1 +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP30]], 1 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_013]] ; CHECK-NEXT: store i32 [[ADD2]], ptr [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw 
i64 [[I_013]], 1 Index: llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll @@ -152,13 +152,17 @@ ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> , [[STRIDED_VEC]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 -; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[TMP0]] poison, <4 x i32> [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue [[TMP0]] [[TMP7]], <4 x i32> [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue [[TMP0]] [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> , [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -169,8 +173,8 @@ ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 2 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP14]] ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]] ; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1 Index: llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -15,18 +15,22 @@ ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[TMP0]] poison, <8 x i32> [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1 -; CHECK-NEXT: 
[[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[TMP0]] [[TMP4]], <8 x i32> [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[TMP0]] [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue [[TMP0]] [[TMP5]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 -1 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP13]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -89,18 +93,22 @@ ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[TMP1]] poison, <4 x i64> [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[TMP1]] [[TMP4]], <4 x i64> [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[TMP1]] [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue [[TMP1]] [[TMP5]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[TMP6]], +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i64> [[TMP7]], +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 -1 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP11]], <8 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> 
[[TMP13]], <8 x i64> poison, <8 x i32>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -163,23 +171,29 @@
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <6 x i32>, ptr [[TMP3]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[TMP2]] poison, <2 x i32> [[STRIDED_VEC]], 0
 ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[TMP2]] [[TMP4]], <2 x i32> [[STRIDED_VEC1]], 1
 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[STRIDED_VEC]],
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[STRIDED_VEC1]],
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[STRIDED_VEC2]],
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP6]], <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <6 x i32>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <6 x i32> [[TMP13]], <6 x i32> poison, <6 x i32>
-; CHECK-NEXT: store <6 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertvalue [[TMP2]] [[TMP5]], <2 x i32> [[STRIDED_VEC2]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue [[TMP2]] [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue [[TMP2]] [[TMP6]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue [[TMP2]] [[TMP6]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP7]],
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]],
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP11]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP9]],
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 -2
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP12]], <4 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <6 x i32>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <6 x i32> [[TMP19]], <6 x i32> poison, <6 x i32>
+; CHECK-NEXT: store <6 x i32> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -425,22 +439,30 @@
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8
-; CHECK-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue [[TMP3]] poison, <8 x i32> [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue [[TMP3]] [[TMP8]], <8 x i32> [[STRIDED_VEC1]], 1
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = insertvalue [[TMP4]] poison, <8 x i32> [[STRIDED_VEC3]], 0
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = insertvalue [[TMP4]] [[TMP10]], <8 x i32> [[STRIDED_VEC4]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue [[TMP3]] [[TMP9]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue [[TMP4]] [[TMP11]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue [[TMP3]] [[TMP9]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue [[TMP4]] [[TMP11]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = add <8 x i32> [[TMP12]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = add <8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[Q]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
+; CHECK-NEXT: store <8 x i32> [[TMP16]], ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP18]], i32 8
+; CHECK-NEXT: store <8 x i32> [[TMP17]], ptr [[TMP21]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -506,22 +528,30 @@
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[Q]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4
-; CHECK-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue [[TMP5]] poison, <4 x i64> [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue [[TMP5]] [[TMP8]], <4 x i64> [[STRIDED_VEC1]], 1
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = insertvalue [[TMP6]] poison, <4 x i64> [[STRIDED_VEC3]], 0
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = insertvalue [[TMP6]] [[TMP10]], <4 x i64> [[STRIDED_VEC4]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue [[TMP5]] [[TMP9]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue [[TMP6]] [[TMP11]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue [[TMP5]] [[TMP9]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue [[TMP6]] [[TMP11]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = add <4 x i64> [[TMP13]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[Q]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP18]], i32 0
+; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[TMP18]], i32 4
+; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP21]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
Index: llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -169,12 +169,16 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]],
-; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP3]], <8 x ptr> [[TMP0]], i32 4, <8 x i1> )
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[TMP0]] poison, <8 x i32> [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP3]], <8 x i32> [[STRIDED_VEC2]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP5]],
+; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP6]], <8 x ptr> [[TMP0]], i32 4, <8 x i1> )
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 64
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
Index: llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -19,20 +19,20 @@
 ; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
 ; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]]
 ; SSE-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
-; SSE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
 ; SSE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32>
-; SSE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32>
-; SSE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32>
-; SSE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32>
-; SSE-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
-; SSE-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
-; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; SSE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
-; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 4
-; SSE-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP11]], align 4
+; SSE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32>
+; SSE-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; SSE-NEXT: [[STRIDED_VEC3:%.*]] =
shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; SSE-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; SSE-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 4 +; SSE-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4 ; SSE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SSE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SSE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SSE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SSE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SSE: middle.block: ; SSE-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; SSE: scalar.ph: @@ -40,7 +40,7 @@ ; SSE: for.cond.cleanup: ; SSE-NEXT: ret void ; SSE: for.body: -; SSE-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; SSE-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; ; AVX1-LABEL: @foo( ; AVX1-NEXT: entry: @@ -61,32 +61,32 @@ ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP4]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]] ; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 -; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4 -; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4 -; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4 ; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] -; AVX1-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] -; AVX1-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]] -; AVX1-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]] -; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; AVX1-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP19]], align 4 -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 4 -; AVX1-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP21]], align 4 -; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 8 -; AVX1-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP23]], align 4 -; AVX1-NEXT: [[TMP25:%.*]] = 
getelementptr inbounds i32, ptr [[TMP19]], i64 12 -; AVX1-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP25]], align 4 +; AVX1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4 +; AVX1-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4 +; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i32> [[WIDE_VEC5]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i32> [[WIDE_VEC5]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC8:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4 +; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC8]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC8]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; AVX1-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] +; AVX1-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC6]] +; AVX1-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC9]] +; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; AVX1-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP15]], align 4 +; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 4 +; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP16]], align 4 +; AVX1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 8 +; AVX1-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP17]], align 4 +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 12 +; AVX1-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP18]], align 4 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; AVX1-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; AVX1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: @@ -94,7 +94,7 @@ ; AVX1: for.cond.cleanup: ; AVX1-NEXT: ret void ; AVX1: for.body: -; AVX1-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; AVX1-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; ; AVX2-LABEL: @foo( ; AVX2-NEXT: entry: @@ -115,32 +115,32 @@ ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP4]] ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]] ; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 -; AVX2-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4 -; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4 -; AVX2-NEXT: [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4 ; AVX2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC5:%.*]] = 
shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[TMP15:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] -; AVX2-NEXT: [[TMP16:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] -; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]] -; AVX2-NEXT: [[TMP18:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]] -; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; AVX2-NEXT: store <8 x i32> [[TMP15]], ptr [[TMP19]], align 4 -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 8 -; AVX2-NEXT: store <8 x i32> [[TMP16]], ptr [[TMP21]], align 4 -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 16 -; AVX2-NEXT: store <8 x i32> [[TMP17]], ptr [[TMP23]], align 4 -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 24 -; AVX2-NEXT: store <8 x i32> [[TMP18]], ptr [[TMP25]], align 4 +; AVX2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4 +; AVX2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[WIDE_VEC5:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4 +; AVX2-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC5]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i32> [[WIDE_VEC5]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[WIDE_VEC8:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4 +; AVX2-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC8]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC8]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] +; AVX2-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC6]] +; AVX2-NEXT: [[TMP14:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC9]] +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; AVX2-NEXT: store <8 x i32> [[TMP11]], ptr [[TMP15]], align 4 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 8 +; AVX2-NEXT: store <8 x i32> [[TMP12]], ptr [[TMP16]], align 4 +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 16 +; AVX2-NEXT: store <8 x i32> [[TMP13]], ptr [[TMP17]], align 4 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 24 +; AVX2-NEXT: store <8 x i32> [[TMP14]], ptr [[TMP18]], align 4 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; AVX2-NEXT: br i1 [[TMP27]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; AVX2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: @@ -148,7 +148,7 @@ ; AVX2: for.cond.cleanup: ; AVX2-NEXT: ret void ; AVX2: for.body: -; AVX2-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; AVX2-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; ; ATOM-LABEL: @foo( ; ATOM-NEXT: entry: Index: llvm/test/Transforms/LoopVectorize/X86/pr47437.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -26,25 +26,33 @@ ; SSE2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0 ; SSE2-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 ; SSE2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = insertvalue [[TMP0]] poison, <4 x i16> [[STRIDED_VEC]], 0 ; SSE2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]] -; SSE2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 -; SSE2-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2 +; SSE2-NEXT: [[TMP5:%.*]] = insertvalue [[TMP0]] [[TMP4]], <4 x i16> [[STRIDED_VEC1]], 1 +; SSE2-NEXT: [[TMP6:%.*]] = extractvalue [[TMP0]] [[TMP5]], 0 +; SSE2-NEXT: [[TMP7:%.*]] = extractvalue [[TMP0]] [[TMP5]], 1 +; SSE2-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32> +; SSE2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]] +; SSE2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 +; SSE2-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2 ; SSE2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> +; SSE2-NEXT: [[TMP11:%.*]] = insertvalue [[TMP1]] poison, <4 x i16> [[STRIDED_VEC3]], 0 ; SSE2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; SSE2-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32> -; SSE2-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[TMP7]], [[TMP4]] -; SSE2-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32> -; SSE2-NEXT: [[TMP10:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> -; SSE2-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP9]] -; SSE2-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP8]] -; SSE2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] -; SSE2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; SSE2-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4 +; SSE2-NEXT: [[TMP12:%.*]] = insertvalue [[TMP1]] [[TMP11]], <4 x i16> [[STRIDED_VEC4]], 1 +; SSE2-NEXT: [[TMP13:%.*]] = extractvalue [[TMP1]] [[TMP12]], 0 +; SSE2-NEXT: [[TMP14:%.*]] = extractvalue [[TMP1]] [[TMP12]], 1 +; SSE2-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[TMP13]] to <4 x i32> +; SSE2-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP8]] +; SSE2-NEXT: 
[[TMP17:%.*]] = sext <4 x i16> [[TMP7]] to <4 x i32> +; SSE2-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[TMP14]] to <4 x i32> +; SSE2-NEXT: [[TMP19:%.*]] = mul nsw <4 x i32> [[TMP18]], [[TMP17]] +; SSE2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP19]], [[TMP16]] +; SSE2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] +; SSE2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 +; SSE2-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP22]], align 4 ; SSE2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SSE2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SSE2-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SSE2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SSE2-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SSE2: middle.block: ; SSE2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; SSE2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -53,28 +61,28 @@ ; SSE2-NEXT: br label [[FOR_BODY:%.*]] ; SSE2: for.body: ; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; SSE2-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP16]] -; SSE2-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; SSE2-NEXT: [[CONV:%.*]] = sext i16 [[TMP17]] to i32 -; SSE2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP16]] -; SSE2-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; SSE2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP18]] to i32 +; SSE2-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP24]] +; SSE2-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; SSE2-NEXT: [[CONV:%.*]] = sext i16 [[TMP25]] to i32 +; SSE2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP24]] +; SSE2-NEXT: [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; SSE2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP26]] to i32 ; SSE2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] -; SSE2-NEXT: [[TMP19:%.*]] = or i64 [[TMP16]], 1 -; SSE2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP19]] -; SSE2-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 -; SSE2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP20]] to i32 -; SSE2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP19]] -; SSE2-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 -; SSE2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP21]] to i32 +; SSE2-NEXT: [[TMP27:%.*]] = or i64 [[TMP24]], 1 +; SSE2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP27]] +; SSE2-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; SSE2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP28]] to i32 +; SSE2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP27]] +; SSE2-NEXT: [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 +; SSE2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP29]] to i32 ; SSE2-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]] ; SSE2-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]] ; SSE2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]] ; SSE2-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; SSE2-NEXT: 
[[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; SSE2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; SSE2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; SSE2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SSE2: for.end.loopexit: ; SSE2-NEXT: br label [[FOR_END]] ; SSE2: for.end: @@ -103,44 +111,60 @@ ; SSE41-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0 ; SSE41-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 ; SSE41-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2 -; SSE41-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 ; SSE41-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> -; SSE41-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC2]] to <4 x i32> -; SSE41-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP2]] -; SSE41-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP3]] -; SSE41-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 -; SSE41-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 -; SSE41-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2 -; SSE41-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 -; SSE41-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> -; SSE41-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> -; SSE41-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP8]] -; SSE41-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP9]] -; SSE41-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32> -; SSE41-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> -; SSE41-NEXT: [[TMP20:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> -; SSE41-NEXT: [[TMP21:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> -; SSE41-NEXT: [[TMP22:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP18]] -; SSE41-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP19]] -; SSE41-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[TMP22]], [[TMP16]] -; SSE41-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP17]] -; SSE41-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] -; SSE41-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]] -; SSE41-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0 -; SSE41-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP28]], align 4 -; SSE41-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 4 -; SSE41-NEXT: store <4 x i32> [[TMP25]], 
ptr [[TMP29]], align 4 +; SSE41-NEXT: [[TMP8:%.*]] = insertvalue [[TMP0]] poison, <4 x i16> [[STRIDED_VEC]], 0 +; SSE41-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP9:%.*]] = insertvalue [[TMP0]] [[TMP8]], <4 x i16> [[STRIDED_VEC1]], 1 +; SSE41-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; SSE41-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP10:%.*]] = insertvalue [[TMP1]] poison, <4 x i16> [[STRIDED_VEC3]], 0 +; SSE41-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP11:%.*]] = insertvalue [[TMP1]] [[TMP10]], <4 x i16> [[STRIDED_VEC4]], 1 +; SSE41-NEXT: [[TMP12:%.*]] = extractvalue [[TMP0]] [[TMP9]], 0 +; SSE41-NEXT: [[TMP13:%.*]] = extractvalue [[TMP1]] [[TMP11]], 0 +; SSE41-NEXT: [[TMP14:%.*]] = extractvalue [[TMP0]] [[TMP9]], 1 +; SSE41-NEXT: [[TMP15:%.*]] = extractvalue [[TMP1]] [[TMP11]], 1 +; SSE41-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[TMP12]] to <4 x i32> +; SSE41-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[TMP13]] to <4 x i32> +; SSE41-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP2]] +; SSE41-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP3]] +; SSE41-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0 +; SSE41-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0 +; SSE41-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i16>, ptr [[TMP20]], align 2 +; SSE41-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP22:%.*]] = insertvalue [[TMP2]] poison, <4 x i16> [[STRIDED_VEC6]], 0 +; SSE41-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP23:%.*]] = insertvalue [[TMP2]] [[TMP22]], <4 x i16> [[STRIDED_VEC7]], 1 +; SSE41-NEXT: [[WIDE_VEC8:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2 +; SSE41-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC8]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP24:%.*]] = insertvalue [[TMP3]] poison, <4 x i16> [[STRIDED_VEC9]], 0 +; SSE41-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC8]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP25:%.*]] = insertvalue [[TMP3]] [[TMP24]], <4 x i16> [[STRIDED_VEC10]], 1 +; SSE41-NEXT: [[TMP26:%.*]] = extractvalue [[TMP2]] [[TMP23]], 0 +; SSE41-NEXT: [[TMP27:%.*]] = extractvalue [[TMP3]] [[TMP25]], 0 +; SSE41-NEXT: [[TMP28:%.*]] = extractvalue [[TMP2]] [[TMP23]], 1 +; SSE41-NEXT: [[TMP29:%.*]] = extractvalue [[TMP3]] [[TMP25]], 1 +; SSE41-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[TMP26]] to <4 x i32> +; SSE41-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[TMP27]] to <4 x i32> +; SSE41-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP16]] +; SSE41-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP17]] +; SSE41-NEXT: [[TMP34:%.*]] = sext <4 x i16> [[TMP14]] to <4 x i32> +; SSE41-NEXT: [[TMP35:%.*]] = sext <4 x i16> [[TMP15]] to <4 x i32> +; SSE41-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[TMP28]] to <4 x i32> +; SSE41-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[TMP29]] to <4 x i32> +; SSE41-NEXT: [[TMP38:%.*]] = mul nsw <4 x i32> [[TMP36]], [[TMP34]] +; SSE41-NEXT: [[TMP39:%.*]] = mul nsw <4 x i32> [[TMP37]], [[TMP35]] +; SSE41-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[TMP38]], [[TMP32]] +; SSE41-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP33]] +; SSE41-NEXT: 
[[TMP42:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] +; SSE41-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]] +; SSE41-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 0 +; SSE41-NEXT: store <4 x i32> [[TMP40]], ptr [[TMP44]], align 4 +; SSE41-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 4 +; SSE41-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP45]], align 4 ; SSE41-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SSE41-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SSE41-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SSE41-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SSE41-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SSE41: middle.block: ; SSE41-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; SSE41-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -149,28 +173,28 @@ ; SSE41-NEXT: br label [[FOR_BODY:%.*]] ; SSE41: for.body: ; SSE41-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; SSE41-NEXT: [[TMP31:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; SSE41-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP31]] -; SSE41-NEXT: [[TMP32:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; SSE41-NEXT: [[CONV:%.*]] = sext i16 [[TMP32]] to i32 -; SSE41-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP31]] -; SSE41-NEXT: [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; SSE41-NEXT: [[CONV5:%.*]] = sext i16 [[TMP33]] to i32 +; SSE41-NEXT: [[TMP47:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; SSE41-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP47]] +; SSE41-NEXT: [[TMP48:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; SSE41-NEXT: [[CONV:%.*]] = sext i16 [[TMP48]] to i32 +; SSE41-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP47]] +; SSE41-NEXT: [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; SSE41-NEXT: [[CONV5:%.*]] = sext i16 [[TMP49]] to i32 ; SSE41-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] -; SSE41-NEXT: [[TMP34:%.*]] = or i64 [[TMP31]], 1 -; SSE41-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP34]] -; SSE41-NEXT: [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 -; SSE41-NEXT: [[CONV11:%.*]] = sext i16 [[TMP35]] to i32 -; SSE41-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP34]] -; SSE41-NEXT: [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 -; SSE41-NEXT: [[CONV16:%.*]] = sext i16 [[TMP36]] to i32 +; SSE41-NEXT: [[TMP50:%.*]] = or i64 [[TMP47]], 1 +; SSE41-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP50]] +; SSE41-NEXT: [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; SSE41-NEXT: [[CONV11:%.*]] = sext i16 [[TMP51]] to i32 +; SSE41-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP50]] +; SSE41-NEXT: [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 +; SSE41-NEXT: [[CONV16:%.*]] = sext i16 [[TMP52]] to i32 ; SSE41-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]] ; SSE41-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]] ; SSE41-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]] ; SSE41-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; 
SSE41-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; SSE41-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; SSE41-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; SSE41-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SSE41: for.end.loopexit: ; SSE41-NEXT: br label [[FOR_END]] ; SSE41: for.end: @@ -207,80 +231,112 @@ ; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 ; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 ; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2 -; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 -; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2 -; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2 ; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> -; AVX1-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> -; AVX1-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32> -; AVX1-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32> -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP4]] -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]] -; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]] -; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]] -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[TMP20]], i32 0 -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0 -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0 -; AVX1-NEXT: [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP24]], align 2 -; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2 -; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2 -; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2 -; AVX1-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> -; 
AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32> -; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32> -; AVX1-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32> -; AVX1-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32> -; AVX1-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP16]] -; AVX1-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP29]], [[TMP17]] -; AVX1-NEXT: [[TMP34:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP18]] -; AVX1-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP19]] -; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> -; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> -; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> -; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> -; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32> -; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32> -; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32> -; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32> -; AVX1-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]] -; AVX1-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]] -; AVX1-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]] -; AVX1-NEXT: [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]] -; AVX1-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[TMP44]], [[TMP32]] -; AVX1-NEXT: [[TMP49:%.*]] = add nsw <4 x i32> [[TMP45]], [[TMP33]] -; AVX1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP34]] -; AVX1-NEXT: [[TMP51:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP35]] -; AVX1-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 0 -; AVX1-NEXT: store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4 -; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 4 -; AVX1-NEXT: store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4 -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 8 -; AVX1-NEXT: store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4 -; AVX1-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 12 -; AVX1-NEXT: store <4 x i32> [[TMP51]], ptr [[TMP59]], align 4 +; AVX1-NEXT: [[TMP16:%.*]] = insertvalue [[TMP0]] poison, <4 x i16> [[STRIDED_VEC]], 0 +; AVX1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP17:%.*]] = insertvalue [[TMP0]] [[TMP16]], <4 x i16> [[STRIDED_VEC1]], 1 +; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 +; AVX1-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP18:%.*]] = insertvalue [[TMP1]] poison, <4 x i16> [[STRIDED_VEC3]], 0 +; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = 
shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP19:%.*]] = insertvalue [[TMP1]] [[TMP18]], <4 x i16> [[STRIDED_VEC4]], 1 +; AVX1-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2 +; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP20:%.*]] = insertvalue [[TMP2]] poison, <4 x i16> [[STRIDED_VEC6]], 0 +; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP21:%.*]] = insertvalue [[TMP2]] [[TMP20]], <4 x i16> [[STRIDED_VEC7]], 1 +; AVX1-NEXT: [[WIDE_VEC8:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2 +; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC8]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP22:%.*]] = insertvalue [[TMP3]] poison, <4 x i16> [[STRIDED_VEC9]], 0 +; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC8]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP23:%.*]] = insertvalue [[TMP3]] [[TMP22]], <4 x i16> [[STRIDED_VEC10]], 1 +; AVX1-NEXT: [[TMP24:%.*]] = extractvalue [[TMP0]] [[TMP17]], 0 +; AVX1-NEXT: [[TMP25:%.*]] = extractvalue [[TMP1]] [[TMP19]], 0 +; AVX1-NEXT: [[TMP26:%.*]] = extractvalue [[TMP2]] [[TMP21]], 0 +; AVX1-NEXT: [[TMP27:%.*]] = extractvalue [[TMP3]] [[TMP23]], 0 +; AVX1-NEXT: [[TMP28:%.*]] = extractvalue [[TMP0]] [[TMP17]], 1 +; AVX1-NEXT: [[TMP29:%.*]] = extractvalue [[TMP1]] [[TMP19]], 1 +; AVX1-NEXT: [[TMP30:%.*]] = extractvalue [[TMP2]] [[TMP21]], 1 +; AVX1-NEXT: [[TMP31:%.*]] = extractvalue [[TMP3]] [[TMP23]], 1 +; AVX1-NEXT: [[TMP32:%.*]] = sext <4 x i16> [[TMP24]] to <4 x i32> +; AVX1-NEXT: [[TMP33:%.*]] = sext <4 x i16> [[TMP25]] to <4 x i32> +; AVX1-NEXT: [[TMP34:%.*]] = sext <4 x i16> [[TMP26]] to <4 x i32> +; AVX1-NEXT: [[TMP35:%.*]] = sext <4 x i16> [[TMP27]] to <4 x i32> +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP4]] +; AVX1-NEXT: [[TMP37:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]] +; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]] +; AVX1-NEXT: [[TMP39:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]] +; AVX1-NEXT: [[TMP40:%.*]] = getelementptr inbounds i16, ptr [[TMP36]], i32 0 +; AVX1-NEXT: [[TMP41:%.*]] = getelementptr inbounds i16, ptr [[TMP37]], i32 0 +; AVX1-NEXT: [[TMP42:%.*]] = getelementptr inbounds i16, ptr [[TMP38]], i32 0 +; AVX1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i16, ptr [[TMP39]], i32 0 +; AVX1-NEXT: [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP40]], align 2 +; AVX1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP44:%.*]] = insertvalue [[TMP4]] poison, <4 x i16> [[STRIDED_VEC12]], 0 +; AVX1-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP45:%.*]] = insertvalue [[TMP4]] [[TMP44]], <4 x i16> [[STRIDED_VEC13]], 1 +; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP41]], align 2 +; AVX1-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP46:%.*]] = insertvalue [[TMP5]] poison, <4 x i16> [[STRIDED_VEC15]], 0 +; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP47:%.*]] = insertvalue [[TMP5]] [[TMP46]], <4 x i16> [[STRIDED_VEC16]], 1 +; AVX1-NEXT: [[WIDE_VEC17:%.*]] = load <8 x i16>, ptr [[TMP42]], align 2 +; AVX1-NEXT: 
[[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC17]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP48:%.*]] = insertvalue [[TMP6]] poison, <4 x i16> [[STRIDED_VEC18]], 0 +; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC17]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP49:%.*]] = insertvalue [[TMP6]] [[TMP48]], <4 x i16> [[STRIDED_VEC19]], 1 +; AVX1-NEXT: [[WIDE_VEC20:%.*]] = load <8 x i16>, ptr [[TMP43]], align 2 +; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC20]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP50:%.*]] = insertvalue [[TMP7]] poison, <4 x i16> [[STRIDED_VEC21]], 0 +; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC20]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP51:%.*]] = insertvalue [[TMP7]] [[TMP50]], <4 x i16> [[STRIDED_VEC22]], 1 +; AVX1-NEXT: [[TMP52:%.*]] = extractvalue [[TMP4]] [[TMP45]], 0 +; AVX1-NEXT: [[TMP53:%.*]] = extractvalue [[TMP5]] [[TMP47]], 0 +; AVX1-NEXT: [[TMP54:%.*]] = extractvalue [[TMP6]] [[TMP49]], 0 +; AVX1-NEXT: [[TMP55:%.*]] = extractvalue [[TMP7]] [[TMP51]], 0 +; AVX1-NEXT: [[TMP56:%.*]] = extractvalue [[TMP4]] [[TMP45]], 1 +; AVX1-NEXT: [[TMP57:%.*]] = extractvalue [[TMP5]] [[TMP47]], 1 +; AVX1-NEXT: [[TMP58:%.*]] = extractvalue [[TMP6]] [[TMP49]], 1 +; AVX1-NEXT: [[TMP59:%.*]] = extractvalue [[TMP7]] [[TMP51]], 1 +; AVX1-NEXT: [[TMP60:%.*]] = sext <4 x i16> [[TMP52]] to <4 x i32> +; AVX1-NEXT: [[TMP61:%.*]] = sext <4 x i16> [[TMP53]] to <4 x i32> +; AVX1-NEXT: [[TMP62:%.*]] = sext <4 x i16> [[TMP54]] to <4 x i32> +; AVX1-NEXT: [[TMP63:%.*]] = sext <4 x i16> [[TMP55]] to <4 x i32> +; AVX1-NEXT: [[TMP64:%.*]] = mul nsw <4 x i32> [[TMP60]], [[TMP32]] +; AVX1-NEXT: [[TMP65:%.*]] = mul nsw <4 x i32> [[TMP61]], [[TMP33]] +; AVX1-NEXT: [[TMP66:%.*]] = mul nsw <4 x i32> [[TMP62]], [[TMP34]] +; AVX1-NEXT: [[TMP67:%.*]] = mul nsw <4 x i32> [[TMP63]], [[TMP35]] +; AVX1-NEXT: [[TMP68:%.*]] = sext <4 x i16> [[TMP28]] to <4 x i32> +; AVX1-NEXT: [[TMP69:%.*]] = sext <4 x i16> [[TMP29]] to <4 x i32> +; AVX1-NEXT: [[TMP70:%.*]] = sext <4 x i16> [[TMP30]] to <4 x i32> +; AVX1-NEXT: [[TMP71:%.*]] = sext <4 x i16> [[TMP31]] to <4 x i32> +; AVX1-NEXT: [[TMP72:%.*]] = sext <4 x i16> [[TMP56]] to <4 x i32> +; AVX1-NEXT: [[TMP73:%.*]] = sext <4 x i16> [[TMP57]] to <4 x i32> +; AVX1-NEXT: [[TMP74:%.*]] = sext <4 x i16> [[TMP58]] to <4 x i32> +; AVX1-NEXT: [[TMP75:%.*]] = sext <4 x i16> [[TMP59]] to <4 x i32> +; AVX1-NEXT: [[TMP76:%.*]] = mul nsw <4 x i32> [[TMP72]], [[TMP68]] +; AVX1-NEXT: [[TMP77:%.*]] = mul nsw <4 x i32> [[TMP73]], [[TMP69]] +; AVX1-NEXT: [[TMP78:%.*]] = mul nsw <4 x i32> [[TMP74]], [[TMP70]] +; AVX1-NEXT: [[TMP79:%.*]] = mul nsw <4 x i32> [[TMP75]], [[TMP71]] +; AVX1-NEXT: [[TMP80:%.*]] = add nsw <4 x i32> [[TMP76]], [[TMP64]] +; AVX1-NEXT: [[TMP81:%.*]] = add nsw <4 x i32> [[TMP77]], [[TMP65]] +; AVX1-NEXT: [[TMP82:%.*]] = add nsw <4 x i32> [[TMP78]], [[TMP66]] +; AVX1-NEXT: [[TMP83:%.*]] = add nsw <4 x i32> [[TMP79]], [[TMP67]] +; AVX1-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[TMP84]], i32 0 +; AVX1-NEXT: store <4 x i32> [[TMP80]], ptr [[TMP88]], align 4 +; AVX1-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr 
[[TMP84]], i32 4 +; AVX1-NEXT: store <4 x i32> [[TMP81]], ptr [[TMP89]], align 4 +; AVX1-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[TMP84]], i32 8 +; AVX1-NEXT: store <4 x i32> [[TMP82]], ptr [[TMP90]], align 4 +; AVX1-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[TMP84]], i32 12 +; AVX1-NEXT: store <4 x i32> [[TMP83]], ptr [[TMP91]], align 4 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX1-NEXT: [[TMP92:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX1-NEXT: br i1 [[TMP92]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -289,28 +345,28 @@ ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; AVX1-NEXT: [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]] -; AVX1-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; AVX1-NEXT: [[CONV:%.*]] = sext i16 [[TMP62]] to i32 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP61]] -; AVX1-NEXT: [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; AVX1-NEXT: [[CONV5:%.*]] = sext i16 [[TMP63]] to i32 +; AVX1-NEXT: [[TMP93:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP93]] +; AVX1-NEXT: [[TMP94:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; AVX1-NEXT: [[CONV:%.*]] = sext i16 [[TMP94]] to i32 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP93]] +; AVX1-NEXT: [[TMP95:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; AVX1-NEXT: [[CONV5:%.*]] = sext i16 [[TMP95]] to i32 ; AVX1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] -; AVX1-NEXT: [[TMP64:%.*]] = or i64 [[TMP61]], 1 -; AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP64]] -; AVX1-NEXT: [[TMP65:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 -; AVX1-NEXT: [[CONV11:%.*]] = sext i16 [[TMP65]] to i32 -; AVX1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP64]] -; AVX1-NEXT: [[TMP66:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 -; AVX1-NEXT: [[CONV16:%.*]] = sext i16 [[TMP66]] to i32 +; AVX1-NEXT: [[TMP96:%.*]] = or i64 [[TMP93]], 1 +; AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP96]] +; AVX1-NEXT: [[TMP97:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; AVX1-NEXT: [[CONV11:%.*]] = sext i16 [[TMP97]] to i32 +; AVX1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP96]] +; AVX1-NEXT: [[TMP98:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 +; AVX1-NEXT: [[CONV16:%.*]] = sext i16 [[TMP98]] to i32 ; AVX1-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]] ; AVX1-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]] ; AVX1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 
[[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -336,25 +392,33 @@ ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0 ; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 ; AVX2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = insertvalue [[TMP0]] poison, <8 x i16> [[STRIDED_VEC]], 0 ; AVX2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> -; AVX2-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 -; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP6]], align 2 +; AVX2-NEXT: [[TMP5:%.*]] = insertvalue [[TMP0]] [[TMP4]], <8 x i16> [[STRIDED_VEC1]], 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractvalue [[TMP0]] [[TMP5]], 0 +; AVX2-NEXT: [[TMP7:%.*]] = extractvalue [[TMP0]] [[TMP5]], 1 +; AVX2-NEXT: [[TMP8:%.*]] = sext <8 x i16> [[TMP6]] to <8 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 +; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP10]], align 2 ; AVX2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = insertvalue [[TMP1]] poison, <8 x i16> [[STRIDED_VEC3]], 0 ; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> -; AVX2-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[TMP7]], [[TMP4]] -; AVX2-NEXT: [[TMP9:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32> -; AVX2-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32> -; AVX2-NEXT: [[TMP11:%.*]] = mul nsw <8 x i32> [[TMP10]], [[TMP9]] -; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[TMP11]], [[TMP8]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; AVX2-NEXT: store <8 x i32> [[TMP12]], ptr [[TMP14]], align 4 +; AVX2-NEXT: [[TMP12:%.*]] = insertvalue [[TMP1]] [[TMP11]], <8 x i16> [[STRIDED_VEC4]], 1 +; AVX2-NEXT: [[TMP13:%.*]] = extractvalue [[TMP1]] [[TMP12]], 0 +; AVX2-NEXT: [[TMP14:%.*]] = extractvalue [[TMP1]] [[TMP12]], 1 +; AVX2-NEXT: [[TMP15:%.*]] = sext <8 x i16> [[TMP13]] to <8 x i32> +; AVX2-NEXT: [[TMP16:%.*]] = mul nsw <8 x i32> [[TMP15]], [[TMP8]] +; AVX2-NEXT: [[TMP17:%.*]] = sext <8 x i16> [[TMP7]] to <8 x i32> +; AVX2-NEXT: [[TMP18:%.*]] = sext <8 x i16> [[TMP14]] to <8 x i32> +; AVX2-NEXT: [[TMP19:%.*]] = mul nsw <8 x i32> [[TMP18]], [[TMP17]] +; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[TMP19]], [[TMP16]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 +; AVX2-NEXT: store <8 x i32> [[TMP20]], ptr [[TMP22]], align 4 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP15:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX2-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -363,28 +427,28 @@ ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; AVX2-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP16]] -; AVX2-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; AVX2-NEXT: [[CONV:%.*]] = sext i16 [[TMP17]] to i32 -; AVX2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP16]] -; AVX2-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; AVX2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP18]] to i32 +; AVX2-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP24]] +; AVX2-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; AVX2-NEXT: [[CONV:%.*]] = sext i16 [[TMP25]] to i32 +; AVX2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP24]] +; AVX2-NEXT: [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; AVX2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP26]] to i32 ; AVX2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] -; AVX2-NEXT: [[TMP19:%.*]] = or i64 [[TMP16]], 1 -; AVX2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP19]] -; AVX2-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 -; AVX2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP20]] to i32 -; AVX2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP19]] -; AVX2-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 -; AVX2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP21]] to i32 +; AVX2-NEXT: [[TMP27:%.*]] = or i64 [[TMP24]], 1 +; AVX2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP27]] +; AVX2-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; AVX2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP28]] to i32 +; AVX2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP27]] +; AVX2-NEXT: [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 +; AVX2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP29]] to i32 ; AVX2-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]] ; AVX2-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]] ; AVX2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: Index: llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll 
=================================================================== --- llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll +++ llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll @@ -22,12 +22,18 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <96 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 -; CHECK-NEXT: store <32 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[TMP0]] poison, <32 x i8> [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP3]], <32 x i8> [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[TMP0]] [[TMP4]], <32 x i8> [[STRIDED_VEC2]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[TMP0]] [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP8]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: @@ -36,18 +42,24 @@ ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <24 x i8>, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC2]], <24 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC3]], ptr [[TMP10]], align 1 -; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 24 -; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX3]], 0 +; CHECK-NEXT: 
[[TMP11:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_VEC4:%.*]] = load <24 x i8>, ptr [[TMP12]], align 1 +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <24 x i8> [[WIDE_VEC4]], <24 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue [[TMP1]] poison, <8 x i8> [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <24 x i8> [[WIDE_VEC4]], <24 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue [[TMP1]] [[TMP13]], <8 x i8> [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <24 x i8> [[WIDE_VEC4]], <24 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue [[TMP1]] [[TMP14]], <8 x i8> [[STRIDED_VEC7]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue [[TMP1]] [[TMP15]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0 +; CHECK-NEXT: store <8 x i8> [[TMP16]], ptr [[TMP18]], align 1 +; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX3]], 8 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 24 +; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: Index: llvm/test/Transforms/LoopVectorize/induction.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/induction.ll +++ llvm/test/Transforms/LoopVectorize/induction.ll @@ -1181,21 +1181,21 @@ ; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP6]], align 4 -; INTERLEAVE-NEXT: [[WIDE_VEC2:%.*]] = load <32 x float>, ptr [[TMP7]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x float> [[WIDE_VEC]], <32 x float> poison, <4 x i32> -; INTERLEAVE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x float> [[WIDE_VEC2]], <32 x float> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC9:%.*]] = load <32 x float>, ptr [[TMP7]], align 4 +; INTERLEAVE-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <32 x float> [[WIDE_VEC9]], <32 x float> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]] ; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] -; INTERLEAVE-NEXT: [[WIDE_VEC4:%.*]] = load <32 x float>, ptr [[TMP8]], align 4 -; INTERLEAVE-NEXT: [[WIDE_VEC5:%.*]] = load <32 x float>, ptr [[TMP9]], align 4 -; INTERLEAVE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <32 x float> [[WIDE_VEC4]], <32 x float> poison, <4 x i32> -; INTERLEAVE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <32 x float> [[WIDE_VEC5]], <32 x float> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC18:%.*]] = load <32 x float>, ptr [[TMP8]], align 4 +; INTERLEAVE-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <32 x float> [[WIDE_VEC18]], <32 x float> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC27:%.*]] = load <32 x float>, ptr [[TMP9]], align 4 +; INTERLEAVE-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <32 x float> [[WIDE_VEC27]], <32 x float> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> 
[[VEC_PHI]], ; INTERLEAVE-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_PHI1]], ; INTERLEAVE-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[STRIDED_VEC]] -; INTERLEAVE-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[TMP11]], [[STRIDED_VEC3]] -; INTERLEAVE-NEXT: [[TMP14]] = fadd fast <4 x float> [[TMP12]], [[STRIDED_VEC6]] -; INTERLEAVE-NEXT: [[TMP15]] = fadd fast <4 x float> [[TMP13]], [[STRIDED_VEC7]] +; INTERLEAVE-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[TMP11]], [[STRIDED_VEC10]] +; INTERLEAVE-NEXT: [[TMP14]] = fadd fast <4 x float> [[TMP12]], [[STRIDED_VEC19]] +; INTERLEAVE-NEXT: [[TMP15]] = fadd fast <4 x float> [[TMP13]], [[STRIDED_VEC28]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -1480,8 +1480,8 @@ ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 -; INTERLEAVE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 +; INTERLEAVE-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1501,11 +1501,11 @@ ; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP8]], i32 1 ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 8 -; INTERLEAVE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; INTERLEAVE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8 +; INTERLEAVE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT4]] +; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC3]], [[BROADCAST_SPLAT6]] ; INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP17]], i64 0 ; INTERLEAVE-NEXT: store i32 [[TMP19]], ptr [[TMP9]], align 8 ; INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i64 1 @@ -1892,7 +1892,7 @@ ; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] ; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP20]], align 1 -; INTERLEAVE-NEXT: [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP21]], align 1 +; 
INTERLEAVE-NEXT: [[WIDE_VEC6:%.*]] = load <16 x i32>, ptr [[TMP21]], align 1 ; INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1 ; INTERLEAVE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1 ; INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP12]], i32 1 @@ -1909,13 +1909,13 @@ ; INTERLEAVE-NEXT: store i32 [[TMP32]], ptr [[TMP24]], align 1, !alias.scope !17, !noalias !20 ; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12 ; INTERLEAVE-NEXT: store i32 [[TMP33]], ptr [[TMP25]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 0 +; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC6]], i64 0 ; INTERLEAVE-NEXT: store i32 [[TMP34]], ptr [[TMP26]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 4 +; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC6]], i64 4 ; INTERLEAVE-NEXT: store i32 [[TMP35]], ptr [[TMP27]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 8 +; INTERLEAVE-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[WIDE_VEC6]], i64 8 ; INTERLEAVE-NEXT: store i32 [[TMP36]], ptr [[TMP28]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 12 +; INTERLEAVE-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[WIDE_VEC6]], i64 12 ; INTERLEAVE-NEXT: store i32 [[TMP37]], ptr [[TMP29]], align 1, !alias.scope !17, !noalias !20 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] Index: llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -29,28 +29,28 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i64 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP2_1:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP2_1]], align 8 +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP5]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i64 1 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i64 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; CHECK: pred.store.if2: ; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i64 2 ; CHECK-NEXT: store i64 [[TMP10]], ptr [[TMP9]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]] +; CHECK: pred.store.continue3: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -71,7 +71,7 @@ ; CHECK: if.merge: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -124,38 +124,38 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[TMP2]], i32 1 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store 
i64 [[TMP9]], ptr [[PTR0]], align 8 +; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP8]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i64 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; CHECK: pred.store.if2: ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i64 2 ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP11]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: -; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 2 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP5]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]] +; CHECK: pred.store.continue3: +; CHECK-NEXT: [[WIDE_VEC4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[WIDE_VEC4]], i64 0 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[WIDE_VEC4]], i64 2 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -165,15 +165,15 @@ ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[IF_MERGE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[P_0:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[I]], i32 0 ; CHECK-NEXT: [[P_1:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[P_1]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], [[X]] -; CHECK-NEXT: br i1 [[TMP18]], label [[IF_THEN:%.*]], label [[IF_MERGE]] +; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[P_1]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP16]], [[X]] +; CHECK-NEXT: br i1 [[TMP17]], label [[IF_THEN:%.*]], label [[IF_MERGE]] ; CHECK: if.then: -; CHECK-NEXT: store i64 [[TMP17]], ptr [[P_0]], align 8 +; CHECK-NEXT: store i64 [[TMP16]], ptr [[P_0]], align 8 ; CHECK-NEXT: br label [[IF_MERGE]] ; CHECK: if.merge: -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[P_0]], align 8 -; CHECK-NEXT: store i64 [[TMP19]], ptr [[P_1]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[P_0]], align 8 +; CHECK-NEXT: store i64 [[TMP18]], ptr [[P_1]], align 8 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP5:![0-9]+]] @@ -231,7 +231,7 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> 
[[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[TMP2]], i32 0 @@ -240,23 +240,23 @@ ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> ; CHECK-NEXT: store i64 [[X]], ptr [[TMP3]], align 8 ; CHECK-NEXT: store i64 [[X]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[PTR1]], align 8 +; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP8]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i64 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; CHECK: pred.store.if2: ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i64 2 ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP11]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]] +; CHECK: pred.store.continue3: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] Index: llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -352,8 +352,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 -6 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> 
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]] @@ -1017,9 +1017,9 @@ ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 ; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP14]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]] +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP14]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] @@ -1204,9 +1204,9 @@ ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 ; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]] +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
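A brief illustrative note on the shape the updated checks in these tests are matching for fixed-length groups: the wide interleaved load is split with strided shufflevector masks, the pieces are packed into a struct with insertvalue, and the consumers read them back out with extractvalue. The sketch below is not taken from any of these tests; the function and value names are invented, and it uses a literal struct type for brevity, whereas the checks capture an identified struct type as a FileCheck variable such as [[TMP0]].

; Minimal factor-2 sketch of the deinterleave shape (illustrative names only).
define <8 x i16> @deinterleave2_sketch(ptr %p) {
  %wide = load <16 x i16>, ptr %p, align 2
  ; split even/odd lanes with strided shuffle masks
  %even = shufflevector <16 x i16> %wide, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd  = shufflevector <16 x i16> %wide, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ; pack both halves into a struct, then read one field back out
  %s0 = insertvalue { <8 x i16>, <8 x i16> } poison, <8 x i16> %even, 0
  %s1 = insertvalue { <8 x i16>, <8 x i16> } %s0, <8 x i16> %odd, 1
  %r  = extractvalue { <8 x i16>, <8 x i16> } %s1, 0
  ret <8 x i16> %r
}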