Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4201,26 +4201,68 @@
   case InductionDescriptor::IK_PtrInduction: {
     // Handle the pointer induction variable case.
     assert(P->getType()->isPointerTy() && "Unexpected type.");
-    // This is the normalized GEP that starts counting at zero.
-    Value *PtrInd = Induction;
-    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
-    // Determine the number of scalars we need to generate for each unroll
-    // iteration. If the instruction is uniform, we only need to generate the
-    // first lane. Otherwise, we generate all VF values.
-    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
-    // These are the scalar results. Notice that we don't generate vector GEPs
-    // because scalar GEPs result in better code.
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
-        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
-        Value *SclrGep =
-            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
-        SclrGep->setName("next.gep");
-        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+
+    if (Cost->isScalarAfterVectorization(P, VF)) {
+      // This is the normalized GEP that starts counting at zero.
+      Value *PtrInd =
+          Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
+      // Determine the number of scalars we need to generate for each unroll
+      // iteration. If the instruction is uniform, we only need to generate the
+      // first lane. Otherwise, we generate all VF values.
+      unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+          Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
+          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+          Value *SclrGep =
+              emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
+          SclrGep->setName("next.gep");
+          VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+        }
       }
+      return;
+    }
+    assert(isa<SCEVConstant>(II.getStep()) &&
+           "Induction step not a SCEV constant!");
+    Type *PhiType = II.getStep()->getType();
+
+    // Build a pointer phi
+    Value *ScalarStartValue = II.getStartValue();
+    Type *ScStValueType = ScalarStartValue->getType();
+    PHINode *NewPointerPhi =
+        PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
+    NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
+
+    // A pointer induction, performed by using a gep
+    BasicBlock *InductionBlock = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
+    Instruction *InductionLoc = InductionBlock->getTerminator();
+    const SCEV *ScalarStep = II.getStep();
+    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+    Value *ScalarStepValue =
+        Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
+    Value *InductionGEP = GetElementPtrInst::Create(
+        ScStValueType->getPointerElementType(), NewPointerPhi,
+        Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)),
+        "ptr.ind", InductionLoc);
+    NewPointerPhi->addIncoming(InductionGEP, InductionBlock);
+
+    // Create UF many actual address geps that use the pointer
+    // phi as base and a vectorized version of the step value
+    // (<step*0, step*1, ..., step*N>) as offset.
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<Constant *, 8> Indices;
+      // Create a vector of consecutive numbers from zero to VF.
+      for (unsigned i = 0; i < VF; ++i)
+        Indices.push_back(ConstantInt::get(PhiType, i + Part * VF));
+      Constant *StartOffset = ConstantVector::get(Indices);
+
+      Value *GEP = Builder.CreateGEP(
+          ScStValueType->getPointerElementType(), NewPointerPhi,
+          Builder.CreateMul(StartOffset,
+                            Builder.CreateVectorSplat(VF, ScalarStepValue),
+                            "vector.gep"));
+      VectorLoopValueMap.setVectorValue(P, Part, GEP);
     }
-    return;
   }
   }
 }
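
For illustration, this is roughly the shape of IR the new vector code path is expected to produce for a pointer induction with constant stride 3, assuming VF = 4 and UF = 1 and after the offset vector has been constant folded; the function and value names in this sketch are illustrative only and are not taken from the patch:

define void @pointer_phi_sketch(i32* %A, i32 %n) {
entry:
  br label %vector.body

vector.body:
  ; "pointer.phi" starts at the scalar start value and is advanced in the latch.
  %pointer.phi = phi i32* [ %A, %entry ], [ %ptr.ind, %vector.body ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  ; One vector GEP per unroll part: base + <0, 1, 2, 3> * step.
  %vector.gep = getelementptr i32, i32* %pointer.phi, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %wide.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %vector.gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %index.next = add i32 %index, 4
  ; "ptr.ind": advance the pointer phi by VF * UF * step (= 12) elements.
  %ptr.ind = getelementptr i32, i32* %pointer.phi, i32 12
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}

declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

Only the single scalar GEP in the latch carries the induction across iterations; the per-part vector GEPs are pure address computations inside the loop body, which is what allows the loads that use them to be widened into gathers.
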
@@ -4456,6 +4498,7 @@
   // accesses that will remain scalar.
   SmallSetVector<Instruction *, 8> ScalarPtrs;
   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+  auto *Latch = TheLoop->getLoopLatch();
 
   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
   // The pointer operands of loads and stores will be scalar as long as the
@@ -4481,11 +4524,33 @@
            !TheLoop->isLoopInvariant(V);
   };
 
-  // A helper that evaluates a memory access's use of a pointer. If the use
-  // will be a scalar use, and the pointer is only used by memory accesses, we
-  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
-  // PossibleNonScalarPtrs.
+  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
+    if (!isa<PHINode>(Ptr) ||
+        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
+      return false;
+    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
+    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
+      return false;
+    return isScalarUse(MemAccess, Ptr);
+  };
+
+  // A helper that evaluates a memory access's use of a pointer. If the
+  // pointer is a pointer induction of the loop, it is inserted into
+  // Worklist. If the use will be a scalar use, and the pointer is only
+  // used by memory accesses, we place the pointer in ScalarPtrs.
+  // Otherwise, the pointer is placed in PossibleNonScalarPtrs.
   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+    if (isScalarPtrInduction(MemAccess, Ptr)) {
+      Worklist.insert(cast<Instruction>(Ptr));
+      Instruction *Update = cast<Instruction>(
+          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
+      Worklist.insert(Update);
+      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
+                        << "\n");
+      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
+                        << "\n");
+      return;
+    }
     // We only care about bitcast and getelementptr instructions contained in
     // the loop.
     if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -4509,10 +4574,9 @@
   };
 
   // We seed the scalars analysis with three classes of instructions: (1)
-  // instructions marked uniform-after-vectorization, (2) bitcast and
-  // getelementptr instructions used by memory accesses requiring a scalar use,
-  // and (3) pointer induction variables and their update instructions (we
-  // currently only scalarize these).
+  // instructions marked uniform-after-vectorization and (2) bitcast,
+  // getelementptr and (pointer) phi instructions used by memory accesses
+  // requiring a scalar use.
   //
   // (1) Add to the worklist all instructions that have been identified as
   // uniform-after-vectorization.
@@ -4538,24 +4602,6 @@
       Worklist.insert(I);
   }
 
-  // (3) Add to the worklist all pointer induction variables and their update
-  // instructions.
-  //
-  // TODO: Once we are able to vectorize pointer induction variables we should
-  // no longer insert them into the worklist here.
-  auto *Latch = TheLoop->getLoopLatch();
-  for (auto &Induction : Legal->getInductionVars()) {
-    auto *Ind = Induction.first;
-    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
-      continue;
-    Worklist.insert(Ind);
-    Worklist.insert(IndUpdate);
-    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
-    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
-                      << "\n");
-  }
-
   // Insert the forced scalars.
   // FIXME: Currently widenPHIInstruction() often creates a dead vector
   // induction variable when the PHI user is scalarized.
@@ -4591,14 +4637,6 @@
     auto *Ind = Induction.first;
     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
 
-    // We already considered pointer induction variables, so there's no reason
-    // to look at their users again.
-    //
-    // TODO: Once we are able to vectorize pointer induction variables we
-    // should no longer skip over them here.
-    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
-      continue;
-
     // If tail-folding is applied, the primary induction variable will be used
     // to feed a vector compare.
     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
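
As a sketch of when the new path fires, consider a loop whose address computation is a pointer induction with constant stride 3; the IR below is illustrative only and is not part of the patch. On a target where the strided load is widened into a masked gather (as in the ARM/MVE tests below), the use of %a.addr by the load is not a scalar use, so isScalarPtrInduction() returns false, the phi is not marked scalar-after-vectorization, and widenPHIInstruction() takes the new pointer.phi path. The unit-stride pointer %b.addr, by contrast, still satisfies isScalarUse() and keeps the existing scalar next.gep expansion.

define void @stride3_example(i32* %A, i32* %B, i32 %y) {
entry:
  br label %for.body

for.body:
  ; Pointer IV with stride 3: the loads become gathers, so it can now be widened.
  %a.addr = phi i32* [ %A, %entry ], [ %add.ptr, %for.body ]
  ; Pointer IV with stride 1: its only use is a consecutive store, so it stays scalar.
  %b.addr = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %val = load i32, i32* %a.addr, align 4
  %add = add nsw i32 %val, %y
  store i32 %add, i32* %b.addr, align 4
  %add.ptr = getelementptr inbounds i32, i32* %a.addr, i32 3
  %incdec.ptr = getelementptr inbounds i32, i32* %b.addr, i32 1
  %inc = add nuw nsw i32 %i, 1
  %exitcond = icmp eq i32 %inc, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}
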
Index: llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
@@ -107,37 +107,25 @@
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[INDEX]], 3
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = or i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], 3
-; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 3
-; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 3
-; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32*> undef, i32* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32*> [[TMP7]], i32* [[NEXT_GEP4]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32*> [[TMP8]], i32* [[NEXT_GEP5]], i32 2
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32*> [[TMP9]], i32* [[NEXT_GEP6]], i32 3
-; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP10]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[NEXT_GEP7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] =
getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 -; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 12 +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[A_ADDR_09]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[A_ADDR_09]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_09]], i32 3 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], i32* [[B_ADDR_07]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_07]], i32 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -558,37 +546,25 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[A]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 3 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr float, float* [[A]], i32 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 3 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr float, float* [[A]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 3 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr float, float* [[A]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> undef, float* [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[NEXT_GEP4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[NEXT_GEP5]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float*> [[TMP9]], float* [[NEXT_GEP6]], i32 3 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr float, float* [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP10]], i32 4, <4 x i1> , <4 x float> undef) -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; 
CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[NEXT_GEP7]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, float* [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP0]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[NEXT_GEP]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 -; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !17 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 +; CHECK-NEXT: [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i32 12 +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !17 ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[A_ADDR_09]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[A_ADDR_09]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[A_ADDR_09]], i32 3 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP14]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[Y]] ; CHECK-NEXT: store float [[ADD]], float* [[B_ADDR_07]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[B_ADDR_07]], i32 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -756,70 +732,44 @@ ; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 9992 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT14]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDEX]], 6 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 6 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP2]] -; 
CHECK-NEXT: [[TMP3:%.*]] = or i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 6 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 6 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> undef, i32* [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32*> [[TMP7]], i32* [[NEXT_GEP4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32*> [[TMP8]], i32* [[NEXT_GEP5]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> [[TMP9]], i32* [[NEXT_GEP6]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 6 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = or i32 [[INDEX]], 5 -; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 6 -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[INDEX]], 6 -; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 6 -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[INDEX]], 7 -; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 6 -; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32*> undef, i32* [[NEXT_GEP7]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32*> [[TMP19]], i32* [[NEXT_GEP8]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32*> [[TMP20]], i32* [[NEXT_GEP9]], i32 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32*> [[TMP21]], i32* [[NEXT_GEP10]], i32 3 -; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP10]], i32 4, <4 x i1> , <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP22]], i32 4, <4 x i1> , <4 x i32> undef) -; CHECK-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER13]], [[BROADCAST_SPLAT15]] -; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[NEXT_GEP11]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP23]], <4 x i32>* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[NEXT_GEP11]], i32 4 -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP27]], align 4 +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> 
[[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT7]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992 -; CHECK-NEXT: br i1 [[TMP28]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !22 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 48 +; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !22 ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_08:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9992, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[A_ADDR_08]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[A_ADDR_08]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_08]], i32 6 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP29]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], i32* [[B_ADDR_06]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_06]], i32 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop !23 ; + entry: br label %for.body @@ -850,110 +800,53 @@ ; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 9984 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT27:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT26]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT12]], <4 x i32> undef, <4 x i32> zeroinitializer 
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT14]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDEX]], 6 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 6 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 6 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 6 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> undef, i32* [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32*> [[TMP7]], i32* [[NEXT_GEP4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32*> [[TMP8]], i32* [[NEXT_GEP5]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> [[TMP9]], i32* [[NEXT_GEP6]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 6 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = or i32 [[INDEX]], 5 -; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 6 -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[INDEX]], 6 -; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 6 -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[INDEX]], 7 -; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 6 -; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32*> undef, i32* [[NEXT_GEP7]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32*> [[TMP19]], i32* [[NEXT_GEP8]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32*> [[TMP20]], i32* [[NEXT_GEP9]], i32 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32*> [[TMP21]], i32* [[NEXT_GEP10]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[TMP23]], 6 -; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP24]] -; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[INDEX]], 9 -; CHECK-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], 6 -; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP26]] -; CHECK-NEXT: [[TMP27:%.*]] = or i32 [[INDEX]], 10 -; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 6 -; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = or i32 [[INDEX]], 11 -; CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 6 -; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32*> undef, i32* [[NEXT_GEP11]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32*> [[TMP31]], i32* [[NEXT_GEP12]], i32 1 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32*> [[TMP32]], i32* [[NEXT_GEP13]], i32 2 -; 
CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32*> [[TMP33]], i32* [[NEXT_GEP14]], i32 3 -; CHECK-NEXT: [[TMP35:%.*]] = or i32 [[INDEX]], 12 -; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 6 -; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[INDEX]], 13 -; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], 6 -; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = or i32 [[INDEX]], 14 -; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], 6 -; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP40]] -; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[INDEX]], 15 -; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], 6 -; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP42]] -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32*> undef, i32* [[NEXT_GEP15]], i32 0 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i32*> [[TMP43]], i32* [[NEXT_GEP16]], i32 1 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i32*> [[TMP44]], i32* [[NEXT_GEP17]], i32 2 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i32*> [[TMP45]], i32* [[NEXT_GEP18]], i32 3 -; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP10]], i32 4, <4 x i1> , <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_GATHER23:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP22]], i32 4, <4 x i1> , <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_GATHER24:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP34]], i32 4, <4 x i1> , <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_GATHER25:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP46]], i32 4, <4 x i1> , <4 x i32> undef) -; CHECK-NEXT: [[TMP47:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER23]], [[BROADCAST_SPLAT27]] -; CHECK-NEXT: [[TMP49:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER24]], [[BROADCAST_SPLAT29]] -; CHECK-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER25]], [[BROADCAST_SPLAT31]] -; CHECK-NEXT: [[TMP51:%.*]] = bitcast i32* [[NEXT_GEP19]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP47]], <4 x i32>* [[TMP51]], align 4 -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, i32* [[NEXT_GEP19]], i32 4 -; CHECK-NEXT: [[TMP53:%.*]] = bitcast i32* [[TMP52]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP48]], <4 x i32>* [[TMP53]], align 4 -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[NEXT_GEP19]], i32 8 -; CHECK-NEXT: [[TMP55:%.*]] = bitcast i32* [[TMP54]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP49]], <4 x i32>* [[TMP55]], align 4 -; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[NEXT_GEP19]], i32 12 -; CHECK-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP57]], align 4 +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = 
getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT11]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT13]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 4 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 8 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP14]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP58:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984 -; CHECK-NEXT: br i1 [[TMP58]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !24 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 96 +; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !24 ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_08:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9984, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP59:%.*]] = load i32, i32* [[A_ADDR_08]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[A_ADDR_08]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_08]], i32 6 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP59]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], i32* [[B_ADDR_06]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_06]], i32 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 @@ -994,66 +887,42 @@ ; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[Z]], i32 3000 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[X]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: 
[[POINTER_PHI5:%.*]] = phi i8* [ [[Z]], [[VECTOR_PH]] ], [ [[PTR_IND6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[X]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 3 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[X]], i32 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 3 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[X]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 3 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[X]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i8*> undef, i8* [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8*> [[TMP7]], i8* [[NEXT_GEP5]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8*> [[TMP8]], i8* [[NEXT_GEP6]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i8*> [[TMP9]], i8* [[NEXT_GEP7]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, i8* [[Z]], i32 [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = or i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 3 -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, i8* [[Z]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = or i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 3 -; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, i8* [[Z]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 3 -; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, i8* [[Z]], i32 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8*> undef, i8* [[NEXT_GEP8]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8*> [[TMP18]], i8* [[NEXT_GEP9]], i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i8*> [[TMP19]], i8* [[NEXT_GEP10]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i8*> [[TMP20]], i8* [[NEXT_GEP11]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP10]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP10]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !26 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP10]], i32 2 -; CHECK-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP22]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !26 -; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP23]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !26 -; CHECK-NEXT: [[TMP24:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], -; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER12]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER13]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP21]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP24]], <4 x i8*> [[TMP21]], i32 1, <4 x i1> ), !alias.scope !29, !noalias !26 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP21]], i32 2 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> 
[[TMP25]], <4 x i8*> [[TMP27]], i32 1, <4 x i1> ), !alias.scope !29, !noalias !26 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP26]], <4 x i8*> [[TMP28]], i32 1, <4 x i1> ), !alias.scope !29, !noalias !26 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[POINTER_PHI5]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP0]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP0]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !26 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP0]], i32 2 +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP2]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !26 +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP3]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !26 +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER7]] +; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER8]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP4]], <4 x i8*> [[TMP1]], i32 1, <4 x i1> ), !alias.scope !29, !noalias !26 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i32 2 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP5]], <4 x i8*> [[TMP7]], i32 1, <4 x i1> ), !alias.scope !29, !noalias !26 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP6]], <4 x i8*> [[TMP8]], i32 1, <4 x i1> ), !alias.scope !29, !noalias !26 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP29]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !31 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i32 12 +; CHECK-NEXT: [[PTR_IND6]] = getelementptr i8, i8* [[POINTER_PHI5]], i32 12 +; CHECK-NEXT: br i1 [[TMP9]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !31 ; CHECK: for.body: ; CHECK-NEXT: [[X_ADDR_050:%.*]] = phi i8* [ [[INCDEC_PTR2:%.*]], [[FOR_BODY]] ], [ [[X]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[Z_ADDR_049:%.*]] = phi i8* [ [[INCDEC_PTR34:%.*]], [[FOR_BODY]] ], [ [[Z]], [[ENTRY]] ] ; CHECK-NEXT: [[I_048:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, i8* [[X_ADDR_050]], i32 1 -; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[X_ADDR_050]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[X_ADDR_050]], align 1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i8, i8* [[X_ADDR_050]], i32 2 -; CHECK-NEXT: [[TMP31:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1 ; CHECK-NEXT: [[INCDEC_PTR2]] = getelementptr inbounds i8, i8* [[X_ADDR_050]], i32 3 -; CHECK-NEXT: [[TMP32:%.*]] = load i8, i8* [[INCDEC_PTR1]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP30]], 10 -; CHECK-NEXT: [[MUL1:%.*]] = mul i8 [[TMP30]], [[TMP31]] -; CHECK-NEXT: [[MUL2:%.*]] = mul i8 [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[INCDEC_PTR1]], 
align 1 +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP10]], 10 +; CHECK-NEXT: [[MUL1:%.*]] = mul i8 [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i8 [[TMP10]], [[TMP12]] ; CHECK-NEXT: [[INCDEC_PTR32:%.*]] = getelementptr inbounds i8, i8* [[Z_ADDR_049]], i32 1 ; CHECK-NEXT: store i8 [[MUL]], i8* [[Z_ADDR_049]], align 1 ; CHECK-NEXT: [[INCDEC_PTR33:%.*]] = getelementptr inbounds i8, i8* [[Z_ADDR_049]], i32 2 Index: llvm/test/Transforms/LoopVectorize/pointer-induction.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nofree norecurse nounwind +define void @a(i8* readnone %b) local_unnamed_addr #0 { +; CHECK-LABEL: @a( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_NOT4:%.*]] = icmp eq i8* [[B:%.*]], null +; CHECK-NEXT: br i1 [[CMP_NOT4]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[B1:%.*]] = ptrtoint i8* [[B]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[B1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[N_VEC]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP1]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ null, [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 -1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[TMP4]], i64 -3 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1, !tbaa !2 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> undef, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <4 x i8> [[REVERSE]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 0 +; CHECK-NEXT: store i8 95, i8* [[TMP9]], align 1, !tbaa !2 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] +; CHECK: pred.store.if2: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 1 +; CHECK-NEXT: store i8 95, i8* [[TMP11]], align 1, !tbaa !2 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]] +; CHECK: pred.store.continue3: +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> 
[[TMP7]], i32 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] +; CHECK: pred.store.if4: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 2 +; CHECK-NEXT: store i8 95, i8* [[TMP13]], align 1, !tbaa !2 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]] +; CHECK: pred.store.continue5: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 +; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] +; CHECK: pred.store.if6: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 3 +; CHECK-NEXT: store i8 95, i8* [[TMP15]], align 1, !tbaa !2 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]] +; CHECK: pred.store.continue7: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 -4 +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ null, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[C_05:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[C_05]], i64 -1 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1, !tbaa !2 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i8 95, i8* [[INCDEC_PTR]], align 1, !tbaa !2 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i8* [[INCDEC_PTR]], [[B]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7 +; +entry: + %cmp.not4 = icmp eq i8* %b, null + br i1 %cmp.not4, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %if.end + %c.05 = phi i8* [ %incdec.ptr, %if.end ], [ null, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %c.05, i64 -1 + %0 = load i8, i8* %incdec.ptr, align 1, !tbaa !2 + %tobool.not = icmp eq i8 %0, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: ; preds = %for.body + store i8 95, i8* %incdec.ptr, align 1, !tbaa !2 + br label %if.end + +if.end: ; preds = %for.body, %if.then + %cmp.not = icmp eq i8* %incdec.ptr, %b + br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body +} + +attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 11.0.0 "} +!2 = !{!3, !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +