diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1157,6 +1157,14 @@
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index = -1) const;
 
+  /// \return The expected cost of the vector insert or extract instruction
+  /// \p I. The caller guarantees that \p I is not nullptr.
+  ///
+  /// Use this overload when the vector instruction already exists (e.g., in a
+  /// basic block being transformed), so that it can be taken into account.
+  InstructionCost getVectorInstrCost(const Instruction *I, Type *Val,
+                                     unsigned Index = -1) const;
+
   /// \return The cost of replication shuffle of \p VF elements typed \p EltTy
   /// \p ReplicationFactor times.
   ///
@@ -1723,6 +1731,8 @@
                                              const Instruction *I) = 0;
   virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                              unsigned Index) = 0;
+  virtual InstructionCost getVectorInstrCost(const Instruction *I, Type *Val,
+                                             unsigned Index) = 0;
 
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
@@ -2271,6 +2281,10 @@
                                      unsigned Index) override {
     return Impl.getVectorInstrCost(Opcode, Val, Index);
   }
+  InstructionCost getVectorInstrCost(const Instruction *I, Type *Val,
+                                     unsigned Index) override {
+    return Impl.getVectorInstrCost(I, Val, Index);
+  }
   InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -572,6 +572,10 @@
                                      unsigned Index) const {
     return 1;
   }
+  InstructionCost getVectorInstrCost(const Instruction *I, Type *Val,
+                                     unsigned Index) const {
+    return 1;
+  }
 
   unsigned getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                                      const APInt &DemandedDstElts,
@@ -1139,7 +1143,7 @@
       if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
         if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
-      return TargetTTI->getVectorInstrCost(Opcode, Ty, Idx);
+      return TargetTTI->getVectorInstrCost(IE, Ty, Idx);
     }
     case Instruction::ShuffleVector: {
       auto *Shuffle = dyn_cast<ShuffleVectorInst>(U);
@@ -1229,7 +1233,7 @@
         if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
       Type *DstTy = U->getOperand(0)->getType();
-      return TargetTTI->getVectorInstrCost(Opcode, DstTy, Idx);
+      return TargetTTI->getVectorInstrCost(EEI, DstTy, Idx);
     }
     }
     // By default, just classify everything as 'basic'.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1154,6 +1154,11 @@
     return LT.first;
   }
 
+  InstructionCost getVectorInstrCost(const Instruction *I, Type *Val,
+                                     unsigned Index) {
+    return thisT()->getVectorInstrCost(I->getOpcode(), Val, Index);
+  }
+
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
                                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -863,11 +863,29 @@
 InstructionCost TargetTransformInfo::getVectorInstrCost(unsigned Opcode,
                                                         Type *Val,
                                                         unsigned Index) const {
+  // The interface description restricts Opcode to InsertElement or
+  // ExtractElement, and all callers respect that; enforce the contract here
+  // so that misuse is caught in assert-enabled builds.
+  assert((Opcode == Instruction::ExtractElement ||
+          Opcode == Instruction::InsertElement) &&
+         "Expect InsertElement or ExtractElement");
   InstructionCost Cost = TTIImpl->getVectorInstrCost(Opcode, Val, Index);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getVectorInstrCost(const Instruction *I,
+                                                        Type *Val,
+                                                        unsigned Index) const {
+  assert((I != nullptr) && "Expect not-null instruction pointer");
+  assert((I->getOpcode() == Instruction::ExtractElement ||
+          I->getOpcode() == Instruction::InsertElement) &&
+         "Expect InsertElement or ExtractElement");
+  InstructionCost Cost = TTIImpl->getVectorInstrCost(I, Val, Index);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getReplicationShuffleCost(
     Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
     TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -7230,7 +7230,7 @@
     // scalar to vector.
     // The vector chain has to account for the combining cost.
     InstructionCost ScalarCost =
-        TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
+        TTI.getVectorInstrCost(Transition, PromotedType, Index);
     InstructionCost VectorCost = StoreExtractCombineCost;
     enum TargetTransformInfo::TargetCostKind CostKind =
       TargetTransformInfo::TCK_RecipThroughput;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -173,8 +173,11 @@
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
 
+  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
+  InstructionCost getVectorInstrCost(const Instruction *I, Type *Val,
+                                     unsigned Index);
 
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                          bool IsUnsigned,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1857,6 +1857,44 @@
   return ST->getVectorInsertExtractBaseCost();
 }
 
+/// Instruction-aware cost of an insertelement/extractelement.
+///
+/// Starts from the opcode-based estimate and replaces it with the subtarget's
+/// base insert/extract cost when \p I is an extractelement of an integer
+/// element whose result has at least one use.
+/// \param I     Instruction being costed; dereferenced, so must be non-null.
+/// \param Val   Vector type; its scalar type decides the integer check.
+/// \param Index Lane index, forwarded to the opcode-based overload.
+InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction *I,
+                                                   Type *Val, unsigned Index) {
+  // The opcode-based estimate might be an optimistic 0 when the lane is 0.
+  InstructionCost Cost = getVectorInstrCost(I->getOpcode(), Val, Index);
+
+  // Only extracts of integer elements are re-costed below.
+  if (!isa<ExtractElementInst>(I) || !Val->getScalarType()->isIntegerTy())
+    return Cost;
+
+  // According to NEON programmer guide, other than multiply instructions,
+  // instructions that access scalars can access any element in the register
+  // file.
+  //
+  // The cost of extracting a scalar element from a vector register depends on
+  // how the scalar will be used:
+  // 1. If users could use scalars in vector registers directly, the
+  //    extract-element operation is essentially free.
+  // 2. If the user instruction requires a core register as operand (i.e.,
+  //    cannot use scalars in vector registers), an explicit move operation
+  //    will be codegen'd.
+  //
+  // FIXME: Do more accurate cost estimation by analyzing the uses of the
+  // instruction. Until then, every use is assumed to need the explicit move.
+  if (I->use_empty())
+    return Cost;
+
+  // An explicit move is needed, so charge the base insert/extract cost.
+  return ST->getVectorInsertExtractBaseCost();
+}
+
 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -159,6 +159,7 @@
   bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                      ArrayRef<unsigned> Indices = {}) const;
 
+  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index);
   bool isSourceOfDivergence(const Value *V) const;
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -60,6 +60,7 @@
   unsigned getMaxInterleaveFactor(unsigned VF);
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
+  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index);
 };
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -237,6 +237,7 @@
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
 
+  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -151,6 +151,7 @@
                                    TTI::CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
                                    const Instruction *I = nullptr);
+  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
 
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -123,6 +123,7 @@
                                      CmpInst::Predicate VecPred,
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
+  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -107,6 +107,7 @@
                                      CmpInst::Predicate VecPred,
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
+  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
   bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -67,6 +67,7 @@
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr);
+  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -146,6 +146,7 @@
                                      CmpInst::Predicate VecPred,
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
+  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
   InstructionCost getScalarizationOverhead(VectorType *Ty,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5873,8 +5873,7 @@
           continue;
         }
       }
-      Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement,
-                                        EE->getVectorOperandType(), Idx);
+      Cost -= TTIRef.getVectorInstrCost(EE, EE->getVectorOperandType(), Idx);
     }
     // Add a cost for subvector extracts/inserts if required.
     for (const auto &Data : ExtractVectorsTys) {
@@ -6107,9 +6106,8 @@
         for (unsigned I : E->ReuseShuffleIndices) {
           if (ShuffleOrOp == Instruction::ExtractElement) {
             auto *EE = cast<ExtractElementInst>(VL[I]);
-            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  EE->getVectorOperandType(),
-                                                  *getExtractIndex(EE));
+            CommonCost -= TTI->getVectorInstrCost(
+                EE, EE->getVectorOperandType(), *getExtractIndex(EE));
           } else {
             CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
                                                   VecTy, Idx);
@@ -6120,9 +6118,8 @@
         for (Value *V : VL) {
           if (ShuffleOrOp == Instruction::ExtractElement) {
             auto *EE = cast<ExtractElementInst>(V);
-            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  EE->getVectorOperandType(),
-                                                  *getExtractIndex(EE));
+            CommonCost += TTI->getVectorInstrCost(
+                EE, EE->getVectorOperandType(), *getExtractIndex(EE));
           } else {
             --Idx;
             CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -270,10 +270,8 @@
 
   Type *VecTy = Ext0->getVectorOperand()->getType();
   assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
-  InstructionCost Cost0 =
-      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
-  InstructionCost Cost1 =
-      TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+  InstructionCost Cost0 = TTI.getVectorInstrCost(Ext0, VecTy, Index0);
+  InstructionCost Cost1 = TTI.getVectorInstrCost(Ext1, VecTy, Index1);
 
   // If both costs are invalid no shuffle is needed
   if (!Cost0.isValid() && !Cost1.isValid())
@@ -337,10 +335,8 @@
   unsigned Ext0Index = Ext0IndexC->getZExtValue();
   unsigned Ext1Index = Ext1IndexC->getZExtValue();
 
-  InstructionCost Extract0Cost =
-      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
-  InstructionCost Extract1Cost =
-      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
+  InstructionCost Extract0Cost = TTI.getVectorInstrCost(Ext0, VecTy, Ext0Index);
+  InstructionCost Extract1Cost = TTI.getVectorInstrCost(Ext1, VecTy, Ext1Index);
 
   // A more expensive extract will always be replaced by a splat shuffle.
   // For example, if Ext0 is more expensive:
@@ -754,9 +750,8 @@
   if (!VecTy)
     return false;
 
-  InstructionCost OldCost =
-      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
-  OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+  InstructionCost OldCost = TTI.getVectorInstrCost(Ext0, VecTy, Index0);
+  OldCost += TTI.getVectorInstrCost(Ext1, VecTy, Index1);
   OldCost +=
       TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
                              CmpInst::makeCmpResultType(I0->getType()), Pred) *
@@ -776,7 +771,7 @@
   NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
                                 ShufMask);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
-  NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+  NewCost += TTI.getVectorInstrCost(Ext0, CmpTy, CheapIndex);
 
   // Aggressively form vector ops if the cost is equal because the transform
   // may enable further optimization.
diff --git a/llvm/test/Analysis/CostModel/AArch64/kryo.ll b/llvm/test/Analysis/CostModel/AArch64/kryo.ll
--- a/llvm/test/Analysis/CostModel/AArch64/kryo.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/kryo.ll
@@ -21,26 +21,22 @@
     ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1
     %t3 = insertelement <2 x i64> undef, i64 undef, i32 0
     %t4 = insertelement <2 x i64> undef, i64 undef, i32 1
-
     ret void
 }
 
 ; CHECK-LABEL: vectorInstrExtractCost
 define i64 @vectorInstrExtractCost(<4 x i64> %vecreg) {
-    
-    ; Vector extracts - extracting each element at index 0 is considered
-    ; free in the current implementation. When extracting element at index
-    ; 2, 2 is rounded to 0, so extracting element at index 2 has cost 0 as 
-    ; well.
-    ;
     ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 1
-    ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 2
+    ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 2
     %t1 = extractelement <4 x i64> %vecreg, i32 1
     %t2 = extractelement <4 x i64> %vecreg, i32 2
     %ele = add i64 %t2, 1
     %cond = icmp eq i64 %t1, %ele
 
-    ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 0
+    ; Vector extracts - extracting an element has a non-zero cost when the
+    ; extracted value is used as an integer.
+    ;
+    ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 0
     ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 3
     %t0 = extractelement <4 x i64> %vecreg, i32 0
     %t3 = extractelement <4 x i64> %vecreg, i32 3
diff --git a/llvm/test/Transforms/LICM/AArch64/extract-element.ll b/llvm/test/Transforms/LICM/AArch64/extract-element.ll
--- a/llvm/test/Transforms/LICM/AArch64/extract-element.ll
+++ b/llvm/test/Transforms/LICM/AArch64/extract-element.ll
@@ -18,24 +18,23 @@
 ; CHECK-NEXT:    [[TMP12]] = add i64 [[TMP4]], 1
 ; CHECK-NEXT:    br label [[TMP3]]
 ; CHECK:       .split.loop.exit:
-; CHECK-NEXT:    [[DOTLCSSA7:%.*]] = phi <1 x i64> [ [[TMP8]], [[TMP6]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i64 [ [[TMP9]], [[TMP6]] ]
 ; CHECK-NEXT:    [[DOTLCSSA6:%.*]] = phi i64 [ [[TMP4]], [[TMP6]] ]
 ; CHECK-NEXT:    [[DOTPH:%.*]] = phi i1 [ [[TMP5]], [[TMP6]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[DOTLCSSA7]], i64 0
-; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[DOTLCSSA6]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp uge i64 [[TMP15]], [[TMP1]]
-; CHECK-NEXT:    br label [[TMP17:%.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[DOTLCSSA]], -1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], [[DOTLCSSA6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp uge i64 [[TMP14]], [[TMP1]]
+; CHECK-NEXT:    br label [[TMP16:%.*]]
 ; CHECK:       .split.loop.exit2:
 ; CHECK-NEXT:    [[DOTPH3:%.*]] = phi i1 [ [[TMP5]], [[TMP3]] ]
 ; CHECK-NEXT:    [[DOTPH4:%.*]] = phi i1 [ undef, [[TMP3]] ]
-; CHECK-NEXT:    br label [[TMP17]]
-; CHECK:       17:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i1 [ [[TMP16]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = xor i1 [[TMP18]], true
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP19]]
-; CHECK-NEXT:    ret i1 [[TMP21]]
+; CHECK-NEXT:    br label [[TMP16]]
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ [[TMP15]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP18]]
+; CHECK-NEXT:    ret i1 [[TMP20]]
 ;
   br label %3