Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -664,6 +664,12 @@
   bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+  /// Return true if the target forces scalarizing of llvm.masked.gather
+  /// intrinsics.
+  bool forceScalarizeMaskedGather(Type *Type, Align Alignment) const;
+  /// Return true if the target forces scalarizing of llvm.masked.scatter
+  /// intrinsics.
+  bool forceScalarizeMaskedScatter(Type *Type, Align Alignment) const;
 
   /// Return true if the target supports masked compress store.
   bool isLegalMaskedCompressStore(Type *DataType) const;
@@ -1543,6 +1549,8 @@
   virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool forceScalarizeMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool forceScalarizeMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
   virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
   virtual bool enableOrderedReductions() = 0;
@@ -1945,6 +1953,12 @@
   bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedGather(DataType, Alignment);
   }
+  bool forceScalarizeMaskedGather(Type *DataType, Align Alignment) override {
+    return Impl.forceScalarizeMaskedGather(DataType, Alignment);
+  }
+  bool forceScalarizeMaskedScatter(Type *DataType, Align Alignment) override {
+    return Impl.forceScalarizeMaskedScatter(DataType, Alignment);
+  }
   bool isLegalMaskedCompressStore(Type *DataType) override {
     return Impl.isLegalMaskedCompressStore(DataType);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -267,6 +267,14 @@
     return false;
   }
 
+  bool forceScalarizeMaskedGather(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
+  bool forceScalarizeMaskedScatter(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   bool isLegalMaskedCompressStore(Type *DataType) const { return false; }
 
   bool isLegalMaskedExpandLoad(Type *DataType) const { return false; }
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -408,6 +408,16 @@
   return TTIImpl->isLegalMaskedScatter(DataType, Alignment);
 }
 
+bool TargetTransformInfo::forceScalarizeMaskedGather(Type *DataType,
+                                                     Align Alignment) const {
+  return TTIImpl->forceScalarizeMaskedGather(DataType, Alignment);
+}
+
+bool TargetTransformInfo::forceScalarizeMaskedScatter(Type *DataType,
+                                                      Align Alignment) const {
+  return TTIImpl->forceScalarizeMaskedScatter(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
   return TTIImpl->isLegalMaskedCompressStore(DataType);
}
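A note on how the two hooks compose: the default implementations return false, so existing targets are unaffected, and a masked gather survives lowering only when it is legal *and* not force-scalarized. A minimal free-standing sketch of that contract (an illustrative helper restating the ScalarizeMaskedMemIntrin change further below, not code from this patch):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// A masked.gather intrinsic is kept only when the target both supports it and
// does not force its expansion; either a false from isLegalMaskedGather or a
// true from forceScalarizeMaskedGather sends it down the scalarization path.
static bool keepMaskedGather(const TargetTransformInfo &TTI, Type *DataTy,
                             Align Alignment) {
  return TTI.isLegalMaskedGather(DataTy, Alignment) &&
         !TTI.forceScalarizeMaskedGather(DataTy, Alignment);
}
```

This separation lets a target keep answering "gathers are legal" for cost-modelling purposes while still directing the late scalarization pass to expand whatever reaches it.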
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -189,6 +189,18 @@
     return isLegalMaskedLoad(DataTy, Alignment);
   }
 
+  bool forceScalarizeMaskedGather(Type *VTy, Align Alignment) {
+    // For MVE, we have a custom lowering pass that will already have custom
+    // legalised any gathers that it can to MVE intrinsics, and we want to
+    // expand all the rest. The pass runs before the masked intrinsic lowering
+    // pass, so if we are here, we know we want to expand.
+    return true;
+  }
+
+  bool forceScalarizeMaskedScatter(Type *VTy, Align Alignment) {
+    return forceScalarizeMaskedGather(VTy, Alignment);
+  }
+
   bool isLegalMaskedGather(Type *Ty, Align Alignment);
 
   bool isLegalMaskedScatter(Type *Ty, Align Alignment) {
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1116,18 +1116,6 @@
   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
     return false;
 
-  // This method is called in 2 places:
-  //  - from the vectorizer with a scalar type, in which case we need to get
-  //  this as good as we can with the limited info we have (and rely on the
-  //  cost model for the rest).
-  //  - from the masked intrinsic lowering pass with the actual vector type.
-  // For MVE, we have a custom lowering pass that will already have custom
-  // legalised any gathers that we can to MVE intrinsics, and want to expand
-  // all the rest. The pass runs before the masked intrinsic lowering pass, so
-  // if we are here, we know we want to expand.
-  if (isa<FixedVectorType>(Ty))
-    return false;
-
   unsigned EltWidth = Ty->getScalarSizeInBits();
   return ((EltWidth == 32 && Alignment >= 4) ||
          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
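With the `isa<FixedVectorType>` early-out moved into forceScalarizeMaskedGather, isLegalMaskedGather is left with just the element-width and alignment rule. A free-standing restatement of that remaining rule (an illustrative helper, not part of ARMTTIImpl):

```cpp
#include <cstdint>

// Mirrors the return expression left in ARMTTIImpl::isLegalMaskedGather
// above: an MVE gather is only worth forming for naturally aligned 8-, 16-,
// or 32-bit elements.
static bool mveGatherEltOk(unsigned EltWidthBits, uint64_t AlignBytes) {
  return (EltWidthBits == 32 && AlignBytes >= 4) ||
         (EltWidthBits == 16 && AlignBytes >= 2) || EltWidthBits == 8;
}
```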
Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5120,16 +5120,9 @@
     return false;
 
   // This function is called now in two cases: from the Loop Vectorizer
-  // and from the Scalarizer.
-  // When the Loop Vectorizer asks about legality of the feature,
-  // the vectorization factor is not calculated yet. The Loop Vectorizer
-  // sends a scalar type and the decision is based on the width of the
-  // scalar element.
-  // Later on, the cost model will estimate usage this intrinsic based on
-  // the vector type.
-  // The Scalarizer asks again about legality. It sends a vector type.
-  // In this case we can reject non-power-of-2 vectors.
-  // We also reject single element vectors as the type legalizer can't
+  // and from the Scalarizer to ask about the legality of the feature.
+  // In the case of a vector type being sent, we can reject non-power-of-2
+  // vectors. We also reject single element vectors as the type legalizer can't
   // scalarize it.
   if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
     unsigned NumElts = DataVTy->getNumElements();
Index: llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -959,7 +959,8 @@
     Type *LoadTy = CI->getType();
     Align Alignment = DL.getValueOrABITypeAlignment(MA,
                                                     LoadTy->getScalarType());
-    if (TTI.isLegalMaskedGather(LoadTy, Alignment))
+    if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
+        !TTI.forceScalarizeMaskedGather(LoadTy, Alignment))
       return false;
     scalarizeMaskedGather(DL, CI, DTU, ModifiedDT);
     return true;
@@ -970,7 +971,8 @@
     Type *StoreTy = CI->getArgOperand(0)->getType();
     Align Alignment = DL.getValueOrABITypeAlignment(MA,
                                                     StoreTy->getScalarType());
-    if (TTI.isLegalMaskedScatter(StoreTy, Alignment))
+    if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
+        !TTI.forceScalarizeMaskedScatter(StoreTy, Alignment))
      return false;
    scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT);
    return true;
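The vector-shape restriction described in the X86 comment can be stated compactly. The sketch below is an illustrative helper written from that description, not the actual X86TTIImpl code:

```cpp
// Accept only fixed vectors whose element count is a power of two and
// greater than one: non-power-of-2 counts are rejected outright, and
// single-element vectors are rejected because the type legalizer cannot
// scalarize them.
static bool gatherShapeOk(unsigned NumElts) {
  bool IsPowerOf2 = NumElts != 0 && (NumElts & (NumElts - 1)) == 0;
  return IsPowerOf2 && NumElts > 1;
}
```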
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1577,13 +1577,16 @@
   /// Returns true if the target machine can represent \p V as a masked gather
   /// or scatter operation.
-  bool isLegalGatherOrScatter(Value *V) {
+  bool isLegalGatherOrScatter(Value *V,
+                              ElementCount VF = ElementCount::getFixed(1)) {
     bool LI = isa<LoadInst>(V);
     bool SI = isa<StoreInst>(V);
     if (!LI && !SI)
       return false;
     auto *Ty = getLoadStoreType(V);
     Align Align = getLoadStoreAlignment(V);
+    if (VF.isVector())
+      Ty = VectorType::get(Ty, VF);
     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
            (SI && TTI.isLegalMaskedScatter(Ty, Align));
   }
@@ -7226,7 +7229,7 @@
   // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
   InstructionCost Cost;
   if (isa<StoreInst>(&I) && VF.isScalable() &&
-      isLegalGatherOrScatter(&I)) {
+      isLegalGatherOrScatter(&I, VF)) {
     Cost = getGatherScatterCost(&I, VF);
     setWideningDecision(&I, VF, CM_GatherScatter, Cost);
   } else {
@@ -7268,7 +7271,7 @@
       }
       InstructionCost GatherScatterCost =
-          isLegalGatherOrScatter(&I)
+          isLegalGatherOrScatter(&I, VF)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();
Index: llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
+++ llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
@@ -50,8 +50,8 @@
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 32 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 4 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX512: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX512: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 18 for VF 16 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction: %valB.loaded = load i32, i32* %inB, align 4
Index: llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
+++ llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
@@ -50,8 +50,8 @@
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 32 For instruction: %valB.loaded = load i64, i64* %inB, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i64, i64* %inB, align 8
-; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i64, i64* %inB, align 8
-; AVX512: LV: Found an estimated cost of 24 for VF 4 For instruction: %valB.loaded = load i64, i64* %inB, align 8
+; AVX512: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i64, i64* %inB, align 8
+; AVX512: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i64, i64* %inB, align 8
 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i64, i64* %inB, align 8
 ; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %valB.loaded = load i64, i64* %inB, align 8
 ; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %valB.loaded = load i64, i64* %inB, align 8
Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -90,4 +90,4 @@
   ret void
 }
 
-attributes #0 = { "target-features"="+neon,+sve,+v8.1a" }
+attributes #0 = { "target-features"="+neon,+sve,+v8.1a" vscale_range(2, 0) }
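The LoopVectorize change above is what drives the cost-model updates: once a concrete VF is being costed, the legality query is made with the widened vector type, so targets can answer per-VF, and the AVX512 checks above now price VF 2 and VF 4 gathers as scalarized. A simplified sketch of the VF-aware query (assuming only the TTI interface from this patch; the real code also handles stores/scatters):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Widen the load's element type to the candidate VF before asking the
// target, instead of querying with the scalar type alone.
static bool legalGatherAtVF(const TargetTransformInfo &TTI, Type *EltTy,
                            Align Alignment, ElementCount VF) {
  Type *Ty = VF.isVector() ? VectorType::get(EltTy, VF) : EltTy;
  return TTI.isLegalMaskedGather(Ty, Alignment);
}
```

The gather_scatter.ll diff that follows shows the FVW2 consequence of the same decision: the VF 2 masked gathers are replaced by predicated scalar loads.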
Index: llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -95,66 +95,75 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_1:%.*]], [[PRED_LOAD_CONTINUE9_1:%.*]] ]
 ; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX7]]
 ; FVW2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
 ; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
-; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; FVW2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; FVW2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; FVW2-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
-; FVW2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; FVW2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
-; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
-; FVW2-NEXT:    [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD8]], zeroinitializer
-; FVW2-NEXT:    [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD9]], zeroinitializer
-; FVW2-NEXT:    [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD10]], zeroinitializer
-; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX7]]
-; FVW2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x i32> poison)
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 2
-; FVW2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> poison)
-; FVW2-NEXT:    [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 4
-; FVW2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> poison)
-; FVW2-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 6
-; FVW2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP19]], i32 4, <2 x i1> [[TMP11]], <2 x i32> poison)
-; FVW2-NEXT:    [[TMP20:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
-; FVW2-NEXT:    [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD11]] to <2 x i64>
-; FVW2-NEXT:    [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD12]] to <2 x i64>
-; FVW2-NEXT:    [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD13]] to <2 x i64>
-; FVW2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP20]]
-; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP21]]
-; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP22]]
-; FVW2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP23]]
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP24]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef)
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef)
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP28:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP32:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]]
-; FVW2-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]])
-; FVW2-NEXT:    [[TMP34:%.*]] = getelementptr float, float* [[TMP32]], i64 2
-; FVW2-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]])
-; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr float, float* [[TMP32]], i64 4
-; FVW2-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]])
-; FVW2-NEXT:    [[TMP38:%.*]] = getelementptr float, float* [[TMP32]], i64 6
-; FVW2-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]])
-; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8
-; FVW2-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; FVW2-NEXT:    br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FVW2-NEXT:    [[TMP2:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX7]]
+; FVW2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP4]], i32 4, <2 x i1> [[TMP2]], <2 x i32> poison)
+; FVW2-NEXT:    [[TMP5:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
+; FVW2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; FVW2-NEXT:    br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], i64 [[TMP7]]
+; FVW2-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4
+; FVW2-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP11:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; FVW2-NEXT:    br i1 [[TMP12]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP13]]
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]]
+; FVW2-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP18]], <2 x float>* [[TMP20]], i32 4, <2 x i1> [[TMP2]])
+; FVW2-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX7]], 2
+; FVW2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP22]], align 4
+; FVW2-NEXT:    [[TMP23:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer
+; FVW2-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP25]], i32 4, <2 x i1> [[TMP23]], <2 x i32> poison)
+; FVW2-NEXT:    [[TMP26:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64>
+; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x i1> [[TMP23]], i32 0
+; FVW2-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF_1:%.*]], label [[PRED_LOAD_CONTINUE_1:%.*]]
+; FVW2:       pred.load.if.1:
+; FVW2-NEXT:    [[TMP28:%.*]] = extractelement <2 x i64> [[TMP26]], i32 0
+; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP28]]
+; FVW2-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4
+; FVW2-NEXT:    [[TMP31:%.*]] = insertelement <2 x float> poison, float [[TMP30]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE_1]]
+; FVW2:       pred.load.continue.1:
+; FVW2-NEXT:    [[TMP32:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP31]], [[PRED_LOAD_IF_1]] ]
+; FVW2-NEXT:    [[TMP33:%.*]] = extractelement <2 x i1> [[TMP23]], i32 1
+; FVW2-NEXT:    br i1 [[TMP33]], label [[PRED_LOAD_IF8_1:%.*]], label [[PRED_LOAD_CONTINUE9_1]]
+; FVW2:       pred.load.if8.1:
+; FVW2-NEXT:    [[TMP34:%.*]] = extractelement <2 x i64> [[TMP26]], i32 1
+; FVW2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP34]]
+; FVW2-NEXT:    [[TMP36:%.*]] = load float, float* [[TMP35]], align 4
+; FVW2-NEXT:    [[TMP37:%.*]] = insertelement <2 x float> [[TMP32]], float [[TMP36]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9_1]]
+; FVW2:       pred.load.continue9.1:
+; FVW2-NEXT:    [[TMP38:%.*]] = phi <2 x float> [ [[TMP32]], [[PRED_LOAD_CONTINUE_1]] ], [ [[TMP37]], [[PRED_LOAD_IF8_1]] ]
+; FVW2-NEXT:    [[TMP39:%.*]] = fadd <2 x float> [[TMP38]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP40:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP41:%.*]] = bitcast float* [[TMP40]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP39]], <2 x float>* [[TMP41]], i32 4, <2 x i1> [[TMP23]])
+; FVW2-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX7]], 4
+; FVW2-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 4096
+; FVW2-NEXT:    br i1 [[TMP42]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -365,8 +374,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -376,29 +384,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.if8:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.if10:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.continue11:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -610,8 +633,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -621,29 +643,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
+; FVW2:       pred.load.if7:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
+; FVW2:       pred.load.continue8:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF7]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
-; FVW2:       pred.store.if7:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP0]], i32 1
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; FVW2:       pred.store.continue8:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
+; FVW2:       pred.store.if9:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; FVW2:       pred.store.continue10:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -841,8 +878,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -852,29 +888,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float addrspace(1)* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float addrspace(1)* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float addrspace(1)* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float addrspace(1)* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.if8:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.if10:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float addrspace(1)* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.continue11:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -1072,8 +1123,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -1083,29 +1133,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float addrspace(1)* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float addrspace(1)* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.if8:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.if10:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.continue11:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -1303,8 +1368,7 @@
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
-; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
@@ -1314,29 +1378,44 @@
 ; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
 ; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FVW2:       pred.load.if:
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FVW2:       pred.load.continue:
+; FVW2-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; FVW2:       pred.load.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4
+; FVW2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP15]], i32 1
+; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
+; FVW2:       pred.load.continue9:
+; FVW2-NEXT:    [[TMP17:%.*]] = phi <2 x float> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF8]] ]
+; FVW2-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP17]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; FVW2-NEXT:    store float [[TMP12]], float addrspace(1)* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; FVW2-NEXT:    store float [[TMP21]], float addrspace(1)* [[TMP20]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.if8:
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; FVW2-NEXT:    store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.if10:
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; FVW2-NEXT:    store float [[TMP24]], float addrspace(1)* [[TMP23]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; FVW2:       pred.store.continue11:
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2
-; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
-; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; FVW2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP25]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;