Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -664,6 +664,12 @@
   bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+  /// Return true if the target forces scalarizing of llvm.masked.gather
+  /// intrinsics.
+  bool forceScalarizeMaskedGather(Type *Type, Align Alignment) const;
+  /// Return true if the target forces scalarizing of llvm.masked.scatter
+  /// intrinsics.
+  bool forceScalarizeMaskedScatter(Type *Type, Align Alignment) const;
 
   /// Return true if the target supports masked compress store.
   bool isLegalMaskedCompressStore(Type *DataType) const;
@@ -1543,6 +1549,8 @@
   virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool forceScalarizeMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool forceScalarizeMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
   virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
   virtual bool enableOrderedReductions() = 0;
@@ -1945,6 +1953,12 @@
   bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedGather(DataType, Alignment);
   }
+  bool forceScalarizeMaskedGather(Type *DataType, Align Alignment) override {
+    return Impl.forceScalarizeMaskedGather(DataType, Alignment);
+  }
+  bool forceScalarizeMaskedScatter(Type *DataType, Align Alignment) override {
+    return Impl.forceScalarizeMaskedScatter(DataType, Alignment);
+  }
   bool isLegalMaskedCompressStore(Type *DataType) override {
     return Impl.isLegalMaskedCompressStore(DataType);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -267,6 +267,14 @@
     return false;
   }
 
+  bool forceScalarizeMaskedGather(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
+  bool forceScalarizeMaskedScatter(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   bool isLegalMaskedCompressStore(Type *DataType) const { return false; }
 
   bool isLegalMaskedExpandLoad(Type *DataType) const { return false; }
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -408,6 +408,16 @@
   return TTIImpl->isLegalMaskedScatter(DataType, Alignment);
 }
 
+bool TargetTransformInfo::forceScalarizeMaskedGather(Type *DataType,
+                                                     Align Alignment) const {
+  return TTIImpl->forceScalarizeMaskedGather(DataType, Alignment);
+}
+
+bool TargetTransformInfo::forceScalarizeMaskedScatter(Type *DataType,
+                                                      Align Alignment) const {
+  return TTIImpl->forceScalarizeMaskedScatter(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
   return TTIImpl->isLegalMaskedCompressStore(DataType);
}
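A note on how the two hooks compose: the default implementations return false, so existing targets are unaffected, and a masked gather survives lowering only when it is legal *and* not force-scalarized. A minimal free-standing sketch of that contract (an illustrative helper restating the ScalarizeMaskedMemIntrin change further below, not code from this patch):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// A masked.gather intrinsic is kept only when the target both supports it and
// does not force its expansion; either a false from isLegalMaskedGather or a
// true from forceScalarizeMaskedGather sends it down the scalarization path.
static bool keepMaskedGather(const TargetTransformInfo &TTI, Type *DataTy,
                             Align Alignment) {
  return TTI.isLegalMaskedGather(DataTy, Alignment) &&
         !TTI.forceScalarizeMaskedGather(DataTy, Alignment);
}
```

This separation lets a target keep answering "gathers are legal" for cost-modelling purposes while still directing the late scalarization pass to expand whatever reaches it.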
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -189,6 +189,18 @@
     return isLegalMaskedLoad(DataTy, Alignment);
   }
 
+  bool forceScalarizeMaskedGather(Type *VTy, Align Alignment) {
+    // For MVE, we have a custom lowering pass that will already have custom
+    // legalised any gathers that it can to MVE intrinsics, and we want to
+    // expand all the rest. The pass runs before the masked intrinsic lowering
+    // pass, so if we are here, we know we want to expand.
+    return true;
+  }
+
+  bool forceScalarizeMaskedScatter(Type *VTy, Align Alignment) {
+    return forceScalarizeMaskedGather(VTy, Alignment);
+  }
+
   bool isLegalMaskedGather(Type *Ty, Align Alignment);
 
   bool isLegalMaskedScatter(Type *Ty, Align Alignment) {
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1116,18 +1116,6 @@
   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
     return false;
 
-  // This method is called in 2 places:
-  //  - from the vectorizer with a scalar type, in which case we need to get
-  //  this as good as we can with the limited info we have (and rely on the
-  //  cost model for the rest).
-  //  - from the masked intrinsic lowering pass with the actual vector type.
-  // For MVE, we have a custom lowering pass that will already have custom
-  // legalised any gathers that we can to MVE intrinsics, and want to expand
-  // all the rest. The pass runs before the masked intrinsic lowering pass, so
-  // if we are here, we know we want to expand.
-  if (isa<FixedVectorType>(Ty))
-    return false;
-
   unsigned EltWidth = Ty->getScalarSizeInBits();
   return ((EltWidth == 32 && Alignment >= 4) ||
          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
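With the `isa<FixedVectorType>` early-out moved into forceScalarizeMaskedGather, isLegalMaskedGather is left with just the element-width and alignment rule. A free-standing restatement of that remaining rule (an illustrative helper, not part of ARMTTIImpl):

```cpp
#include <cstdint>

// Mirrors the return expression left in ARMTTIImpl::isLegalMaskedGather
// above: an MVE gather is only worth forming for naturally aligned 8-, 16-,
// or 32-bit elements.
static bool mveGatherEltOk(unsigned EltWidthBits, uint64_t AlignBytes) {
  return (EltWidthBits == 32 && AlignBytes >= 4) ||
         (EltWidthBits == 16 && AlignBytes >= 2) || EltWidthBits == 8;
}
```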
Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5120,16 +5120,9 @@
     return false;
 
   // This function is called now in two cases: from the Loop Vectorizer
-  // and from the Scalarizer.
-  // When the Loop Vectorizer asks about legality of the feature,
-  // the vectorization factor is not calculated yet. The Loop Vectorizer
-  // sends a scalar type and the decision is based on the width of the
-  // scalar element.
-  // Later on, the cost model will estimate usage this intrinsic based on
-  // the vector type.
-  // The Scalarizer asks again about legality. It sends a vector type.
-  // In this case we can reject non-power-of-2 vectors.
-  // We also reject single element vectors as the type legalizer can't
+  // and from the Scalarizer to ask about the legality of the feature.
+  // In the case of a vector type being sent, we can reject non-power-of-2
+  // vectors. We also reject single element vectors as the type legalizer can't
   // scalarize it.
   if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
     unsigned NumElts = DataVTy->getNumElements();
Index: llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -959,7 +959,8 @@
     Type *LoadTy = CI->getType();
     Align Alignment = DL.getValueOrABITypeAlignment(MA,
                                                     LoadTy->getScalarType());
-    if (TTI.isLegalMaskedGather(LoadTy, Alignment))
+    if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
+        !TTI.forceScalarizeMaskedGather(LoadTy, Alignment))
       return false;
     scalarizeMaskedGather(DL, CI, DTU, ModifiedDT);
     return true;
@@ -970,7 +971,8 @@
     Type *StoreTy = CI->getArgOperand(0)->getType();
     Align Alignment = DL.getValueOrABITypeAlignment(MA,
                                                     StoreTy->getScalarType());
-    if (TTI.isLegalMaskedScatter(StoreTy, Alignment))
+    if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
+        !TTI.forceScalarizeMaskedScatter(StoreTy, Alignment))
      return false;
    scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT);
    return true;
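The vector-shape restriction described in the X86 comment can be stated compactly. The sketch below is an illustrative helper written from that description, not the actual X86TTIImpl code:

```cpp
// Accept only fixed vectors whose element count is a power of two and
// greater than one: non-power-of-2 counts are rejected outright, and
// single-element vectors are rejected because the type legalizer cannot
// scalarize them.
static bool gatherShapeOk(unsigned NumElts) {
  bool IsPowerOf2 = NumElts != 0 && (NumElts & (NumElts - 1)) == 0;
  return IsPowerOf2 && NumElts > 1;
}
```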
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1577,13 +1577,16 @@
   /// Returns true if the target machine can represent \p V as a masked gather
   /// or scatter operation.
-  bool isLegalGatherOrScatter(Value *V) {
+  bool isLegalGatherOrScatter(Value *V,
+                              ElementCount VF = ElementCount::getFixed(1)) {
     bool LI = isa<LoadInst>(V);
     bool SI = isa<StoreInst>(V);
     if (!LI && !SI)
       return false;
     auto *Ty = getLoadStoreType(V);
     Align Align = getLoadStoreAlignment(V);
+    if (VF.isVector())
+      Ty = VectorType::get(Ty, VF);
     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
            (SI && TTI.isLegalMaskedScatter(Ty, Align));
   }
@@ -7226,7 +7229,7 @@
   // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
   InstructionCost Cost;
   if (isa<StoreInst>(&I) && VF.isScalable() &&
-      isLegalGatherOrScatter(&I)) {
+      isLegalGatherOrScatter(&I, VF)) {
     Cost = getGatherScatterCost(&I, VF);
     setWideningDecision(&I, VF, CM_GatherScatter, Cost);
   } else {
@@ -7268,7 +7271,7 @@
       }
       InstructionCost GatherScatterCost =
-          isLegalGatherOrScatter(&I)
+          isLegalGatherOrScatter(&I, VF)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();
Index: llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
+++ llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
@@ -50,8 +50,8 @@
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 32 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 4 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX512: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX512: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 18 for VF 16 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction: %valB.loaded = load i32, i32* %inB, align 4
Index: llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
+++ llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
@@ -50,8 +50,8 @@
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 32 For instruction: %valB.loaded = load i64, i64* %inB, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i64, i64* %inB, align 8
-; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i64, i64* %inB, align 8
-; AVX512: LV: Found an estimated cost of 24 for VF 4 For instruction: %valB.loaded = load i64, i64* %inB, align 8
+; AVX512: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i64, i64* %inB, align 8
+; AVX512: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i64, i64* %inB, align 8
 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i64, i64* %inB, align 8
 ; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %valB.loaded = load i64, i64* %inB, align 8
 ; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %valB.loaded = load i64, i64* %inB, align 8
Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -90,4 +90,4 @@
   ret void
 }
 
-attributes #0 = { "target-features"="+neon,+sve,+v8.1a" }
+attributes #0 = { "target-features"="+neon,+sve,+v8.1a" vscale_range(2, 0) }
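The LoopVectorize change above is what drives the cost-model updates: once a concrete VF is being costed, the legality query is made with the widened vector type, so targets can answer per-VF, and the AVX512 checks above now price VF 2 and VF 4 gathers as scalarized. A simplified sketch of the VF-aware query (assuming only the TTI interface from this patch; the real code also handles stores/scatters):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Widen the load's element type to the candidate VF before asking the
// target, instead of querying with the scalar type alone.
static bool legalGatherAtVF(const TargetTransformInfo &TTI, Type *EltTy,
                            Align Alignment, ElementCount VF) {
  Type *Ty = VF.isVector() ? VectorType::get(EltTy, VF) : EltTy;
  return TTI.isLegalMaskedGather(Ty, Alignment);
}
```

The gather_scatter.ll diff that follows shows the FVW2 consequence of the same decision: the VF 2 masked gathers are replaced by predicated scalar loads.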
Index: llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -95,66 +95,75 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_1:%.*]], [[PRED_LOAD_CONTINUE9_1:%.*]] ]
 ; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX7]]
 ; FVW2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
 ; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
-; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; FVW2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; FVW2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; FVW2-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
-; FVW2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; FVW2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
-; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
-; FVW2-NEXT:    [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD8]], zeroinitializer
-; FVW2-NEXT:    [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD9]], zeroinitializer
-; FVW2-NEXT:    [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD10]], zeroinitializer
-; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX7]]
-; FVW2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x i32> poison)
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 2
-; FVW2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> poison)
-; FVW2-NEXT:    [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 4
-; FVW2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> poison)
-; FVW2-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 6
-; FVW2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP19]], i32 4, <2 x i1> [[TMP11]], <2 x i32> poison)
-; FVW2-NEXT:    [[TMP20:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
-; FVW2-NEXT:    [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD11]] to <2 x i64>
-; FVW2-NEXT:    [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD12]] to <2 x i64>
-; FVW2-NEXT:    [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD13]] to <2 x i64>
-; FVW2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP20]]
-; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP21]]
-; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP22]]
-; FVW2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP23]]
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP24]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef)
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef)
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP28:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP32:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]]
-; FVW2-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]])
-; FVW2-NEXT:    [[TMP34:%.*]] = getelementptr float, float* [[TMP32]], i64 2
-; FVW2-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]])
-; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr float, float* [[TMP32]], i64 4
-; FVW2-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]])
-; FVW2-NEXT:    [[TMP38:%.*]] = getelementptr float, float* [[TMP32]], i64 6
-; FVW2-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]])
-; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8
-; FVW2-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; FVW2-NEXT:    br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FVW2-NEXT:    [[TMP2:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX7]]
+; FVW2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP4]], i32 4, <2 x i1> [[TMP2]], <2 x i32> poison)
+; FVW2-NEXT:    [[TMP5:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
+; FVW2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; FVW2-NEXT:    br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], i64 [[TMP7]]
+; FVW2-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4
+; FVW2-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP11:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; FVW2-NEXT:    br i1 [[TMP12]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP13]]
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]]
+; FVW2-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP18]], <2 x float>* [[TMP20]], i32 4, <2 x i1> [[TMP2]])
+; FVW2-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX7]], 2
+; FVW2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP22]], align 4
+; FVW2-NEXT:    [[TMP23:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer
+; FVW2-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP25]], i32 4, <2 x i1> [[TMP23]], <2 x i32> poison)
+; FVW2-NEXT:    [[TMP26:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64>
+; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x i1> [[TMP23]], i32 0
+; FVW2-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF_1:%.*]], label [[PRED_LOAD_CONTINUE_1:%.*]]
+; FVW2:       pred.load.if.1:
+; FVW2-NEXT:    [[TMP28:%.*]] = extractelement <2 x i64> [[TMP26]], i32 0
+; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP28]]
+; FVW2-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4
+; FVW2-NEXT:    [[TMP31:%.*]] = insertelement <2 x float> poison, float [[TMP30]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE_1]]
+; FVW2:       pred.load.continue.1:
+; FVW2-NEXT:    [[TMP32:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP31]], [[PRED_LOAD_IF_1]] ]
+; FVW2-NEXT:    [[TMP33:%.*]] = extractelement <2 x i1> [[TMP23]], i32 1
+; FVW2-NEXT:    br i1 [[TMP33]], label [[PRED_LOAD_IF8_1:%.*]], label [[PRED_LOAD_CONTINUE9_1]]
+; FVW2:       pred.load.if8.1:
+; FVW2-NEXT:    [[TMP34:%.*]] = extractelement <2 x i64> [[TMP26]], i32 1
+; FVW2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP34]]
+; FVW2-NEXT:    [[TMP36:%.*]] = load float, float* [[TMP35]], align 4
+; FVW2-NEXT:    [[TMP37:%.*]] = insertelement <2 x float> [[TMP32]], float [[TMP36]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9_1]]
+; FVW2:       pred.load.continue9.1:
+; FVW2-NEXT:    [[TMP38:%.*]] = phi <2 x float> [ [[TMP32]], [[PRED_LOAD_CONTINUE_1]] ], [ [[TMP37]], [[PRED_LOAD_IF8_1]] ]
+; FVW2-NEXT:    [[TMP39:%.*]] = fadd <2 x float> [[TMP38]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP40:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP41:%.*]] = bitcast float* [[TMP40]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP39]], <2 x float>* [[TMP41]], i32 4, <2 x i1> [[TMP23]])
+; FVW2-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX7]], 4
+; FVW2-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 4096
+; FVW2-NEXT:    br i1 [[TMP42]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -365,8 +374,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -376,29 +384,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.if8:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.if10:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.continue11:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -610,8 +633,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -621,29 +643,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
+; FVW2:       pred.load.if7:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
+; FVW2:       pred.load.continue8:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF7]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
-; FVW2:       pred.store.if7:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP0]], i32 1
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; FVW2:       pred.store.continue8:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
+; FVW2:       pred.store.if9:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; FVW2:       pred.store.continue10:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -841,8 +878,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -852,29 +888,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float addrspace(1)* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float addrspace(1)* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float addrspace(1)* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float addrspace(1)* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.if8:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.if10:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float addrspace(1)* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.continue11:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -1072,8 +1123,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -1083,29 +1133,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float addrspace(1)* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float addrspace(1)* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.if8:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.if10:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.continue11:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -1303,8 +1368,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -1314,29 +1378,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float addrspace(1)* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float addrspace(1)* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.if8:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.if10:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float addrspace(1)* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.continue11:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;