diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1154,8 +1154,11 @@
 
   /// \return The expected cost of vector Insert and Extract.
   /// Use -1 to indicate that there is no information on the index value.
+  /// Caller can provide the instruction that holds Opcode in `I`, or use
+  /// nullptr to indcate there is no instruction information.
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index = -1) const;
+                                     unsigned Index = -1,
+                                     const Instruction *I = nullptr) const;
 
   /// \return The cost of replication shuffle of \p VF elements typed \p EltTy
   /// \p ReplicationFactor times.
@@ -1722,7 +1725,8 @@
                                              TTI::TargetCostKind CostKind,
                                              const Instruction *I) = 0;
   virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                             unsigned Index) = 0;
+                                             unsigned Index,
+                                             const Instruction *I) = 0;
 
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
@@ -2267,9 +2271,9 @@
                                      const Instruction *I) override {
     return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
   }
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index) override {
-    return Impl.getVectorInstrCost(Opcode, Val, Index);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I) override {
+    return Impl.getVectorInstrCost(Opcode, Val, Index, I);
   }
   InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -568,8 +568,8 @@
     return 1;
   }
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index) const {
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I) const {
     return 1;
   }
 
@@ -1139,7 +1139,7 @@
       if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
         if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
-      return TargetTTI->getVectorInstrCost(Opcode, Ty, Idx);
+      return TargetTTI->getVectorInstrCost(Opcode, Ty, Idx, I);
     }
     case Instruction::ShuffleVector: {
       auto *Shuffle = dyn_cast<ShuffleVectorInst>(U);
@@ -1229,7 +1229,7 @@
         if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
       Type *DstTy = U->getOperand(0)->getType();
-      return TargetTTI->getVectorInstrCost(Opcode, DstTy, Idx);
+      return TargetTTI->getVectorInstrCost(Opcode, DstTy, Idx, I);
     }
     }
     // By default, just classify everything as 'basic'.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -88,10 +88,12 @@
     InstructionCost Cost = 0;
     // Broadcast cost is equal to the cost of extracting the zero'th element
     // plus the cost of inserting it into every element of the result vector.
-    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
+    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0,
+                                        nullptr);
 
     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
+      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i,
+                                          nullptr);
     }
     return Cost;
   }
@@ -108,8 +110,10 @@
     // vector and finally index 3 of second vector and insert them at index
     // <0,1,2,3> of result vector.
     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
-      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
+      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i,
+                                          nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i,
+                                          nullptr);
     }
     return Cost;
   }
@@ -132,9 +136,9 @@
     // type.
     for (int i = 0; i != NumSubElts; ++i) {
       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
-                                          i + Index);
-      Cost +=
-          thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
+                                          i + Index, nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i,
+                                          nullptr);
     }
     return Cost;
   }
@@ -156,10 +160,10 @@
     // the source type plus the cost of inserting them into the result vector
     // type.
     for (int i = 0; i != NumSubElts; ++i) {
-      Cost +=
-          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
+      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
+                                          i, nullptr);
       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
-                                          i + Index);
+                                          i + Index, nullptr);
     }
     return Cost;
   }
@@ -210,7 +214,7 @@
                                  FixedVectorType::get(
                                      PointerType::get(VT->getElementType(), 0),
                                      VT->getNumElements()),
-                                 -1)
+                                 -1, nullptr)
             : 0;
     InstructionCost LoadCost =
         VT->getNumElements() *
@@ -235,7 +239,7 @@
                Instruction::ExtractElement,
                FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                     VT->getNumElements()),
-               -1) +
+               -1, nullptr) +
            getCFInstrCost(Instruction::Br, CostKind) +
            getCFInstrCost(Instruction::PHI, CostKind));
     }
@@ -716,9 +720,11 @@
       if (!DemandedElts[i])
         continue;
       if (Insert)
-        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i,
+                                            nullptr);
       if (Extract)
-        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i,
+                                            nullptr);
     }
 
     return Cost;
@@ -1083,8 +1089,9 @@
 
   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                            VectorType *VecTy, unsigned Index) {
+    // FIXME: Pass instruction pointer.
     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                       Index) +
+                                       Index, nullptr) +
            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
                                      TTI::CastContextHint::None,
                                      TTI::TCK_RecipThroughput);
@@ -1146,8 +1153,8 @@
     return 1;
   }
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index) {
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I = nullptr) {
     std::pair<InstructionCost, MVT> LT =
         getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
 
@@ -2205,7 +2212,8 @@
     ArithCost +=
         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
     return ShuffleCost + ArithCost +
-           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
+           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
+                                       nullptr);
   }
 
   /// Try to calculate the cost of performing strict (in-order) reductions,
@@ -2311,7 +2319,8 @@
     // The last min/max should be in vector registers and we counted it above.
     // So just need a single extractelement.
     return ShuffleCost + MinMaxCost +
-           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
+           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
+                                       nullptr);
   }
 
   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -860,10 +860,9 @@
   return Cost;
 }
 
-InstructionCost TargetTransformInfo::getVectorInstrCost(unsigned Opcode,
-                                                        Type *Val,
-                                                        unsigned Index) const {
-  InstructionCost Cost = TTIImpl->getVectorInstrCost(Opcode, Val, Index);
+InstructionCost TargetTransformInfo::getVectorInstrCost(
+    unsigned Opcode, Type *Val, unsigned Index, const Instruction *I) const {
+  InstructionCost Cost = TTIImpl->getVectorInstrCost(Opcode, Val, Index, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -7229,8 +7229,8 @@
     // The scalar chain of computation has to pay for the transition
     // scalar to vector.
     // The vector chain has to account for the combining cost.
-    InstructionCost ScalarCost =
-        TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
+    InstructionCost ScalarCost = TTI.getVectorInstrCost(
+        Transition->getOpcode(), PromotedType, Index, Transition);
     InstructionCost VectorCost = StoreExtractCombineCost;
     enum TargetTransformInfo::TargetCostKind CostKind =
       TargetTransformInfo::TCK_RecipThroughput;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -173,8 +173,8 @@
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I);
 
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                          bool IsUnsigned,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1776,8 +1776,11 @@
 
   // Get the cost for the extract. We compute the cost (if any) for the extend
   // below.
+  //
+  // FIXME: Change signature of `getExtractWithExtendCost` so caller
+  // can optionally provide the extract-element instruction as context.
   InstructionCost Cost =
-      getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
+      getVectorInstrCost(Instruction::ExtractElement, VecTy, Index, nullptr);
 
   // Legalize the types.
   auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
@@ -1830,7 +1833,8 @@
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                                   unsigned Index) {
+                                                   unsigned Index,
+                                                   const Instruction *I) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
   if (Index != -1U) {
@@ -1848,8 +1852,45 @@
       Index = Index % Width;
     }
 
-    // The element at index zero is already inside the vector.
-    if (Index == 0)
+    // FIXME: Use `IsExtractedElementUsedAsInteger` to decide cost when index is
+    // explicit (i.e., not -1, not limited to 0).
+    auto IsExtractedElementUsedAsInteger =
+        [Val](const Instruction *Inst) -> bool {
+      if (!isa_and_nonnull<ExtractElementInst>(Inst) ||
+          !Val->getScalarType()->isIntegerTy())
+        return false;
+
+      // Inst is an ExtractElementInst and element type in the vector is
+      // integer.
+      bool UsedAsInteger = false;
+      for (const Use &U : Inst->uses()) {
+        Instruction *UserI = dyn_cast<Instruction>(U.getUser());
+
+        assert(UserI && "All users of an instruction should be instructions");
+
+        // If the extracted element is used in a store operation, it must be the
+        // value (not pointer); and it could be stored bit-wise without an
+        // explicit conversion to integer.
+        if (isa<StoreInst>(UserI))
+          continue;
+
+        // If used as a floating point value, users can access the subregister directly.
+        if (isa<UIToFPInst>(UserI) || isa<SIToFPInst>(UserI))
+          continue;
+
+        // For the rest of cases, assume that the extracted element will be used
+        // as a scalar/integer.
+        UsedAsInteger = true;
+        break;
+      }
+      return UsedAsInteger;
+    };
+
+    // For extract_element instruction, the element at index zero is already
+    // inside the floating-point subregister.
+    //
+    // If element type is integer, an explicit move operation will be codegen'd.
+    if (Index == 0 && !IsExtractedElementUsedAsInteger(I))
       return 0;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -160,7 +160,7 @@
                                      ArrayRef<unsigned> Indices = {}) const;
 
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
-                                     unsigned Index);
+                                     unsigned Index, const Instruction *I);
   bool isSourceOfDivergence(const Value *V) const;
   bool isAlwaysUniform(const Value *V) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -789,7 +789,8 @@
 }
 
 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
-                                               unsigned Index) {
+                                               unsigned Index,
+                                               const Instruction *I) {
   switch (Opcode) {
   case Instruction::ExtractElement:
   case Instruction::InsertElement: {
@@ -798,7 +799,7 @@
     if (EltSize < 32) {
       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
         return 0;
-      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+      return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I);
     }
 
     // Extracts are just reads of a subregister, so are free. Inserts are
@@ -809,7 +810,7 @@
     return Index == ~0u ? 2 : 0;
   }
   default:
-    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+    return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I);
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -61,7 +61,7 @@
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
-                                     unsigned Index);
+                                     unsigned Index, const Instruction *I);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -108,14 +108,15 @@
 }
 
 InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
-                                                unsigned Index) {
+                                                unsigned Index,
+                                                const Instruction *I) {
   switch (Opcode) {
   case Instruction::ExtractElement:
   case Instruction::InsertElement: {
     unsigned EltSize =
         DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
     if (EltSize < 32) {
-      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+      return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I);
     }
 
     // Extracts are just reads of a subregister, so are free. Inserts are
@@ -126,7 +127,7 @@
     return Index == ~0u ? 2 : 0;
   }
   default:
-    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+    return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I);
   }
 }
 
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -237,8 +237,8 @@
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I);
 
   InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
                                             const SCEV *Ptr);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -872,7 +872,8 @@
 }
 
 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
-                                               unsigned Index) {
+                                               unsigned Index,
+                                               const Instruction *I) {
   // Penalize inserting into an D-subregister. We end up with a three times
   // lower estimated throughput on swift.
   if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
@@ -891,7 +892,7 @@
     if (ValTy->isVectorTy() &&
         ValTy->getScalarSizeInBits() <= 32)
       return std::max<InstructionCost>(
-          BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
+          BaseT::getVectorInstrCost(Opcode, ValTy, Index, I), 2U);
   }
 
   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
@@ -904,7 +905,7 @@
     return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
   }
 
-  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+  return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I);
 }
 
 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -151,8 +151,8 @@
                                    TTI::CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
                                    const Instruction *I = nullptr);
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I);
 
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -306,7 +306,8 @@
 }
 
 InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                                   unsigned Index) {
+                                                   unsigned Index,
+                                                   const Instruction *I) {
   Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
                                    : Val;
   if (Opcode == Instruction::InsertElement) {
@@ -315,7 +316,8 @@
     if (ElemTy->isIntegerTy(32))
       return Cost;
     // If it's not a 32-bit value, there will need to be an extract.
-    return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index);
+    return Cost +
+           getVectorInstrCost(Instruction::ExtractElement, Val, Index, I);
   }
 
   if (Opcode == Instruction::ExtractElement)
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -123,8 +123,8 @@
                                      CmpInst::Predicate VecPred,
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I);
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
                                   MaybeAlign Alignment, unsigned AddressSpace,
                                   TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1071,7 +1071,8 @@
 }
 
 InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                               unsigned Index) {
+                                               unsigned Index,
+                                               const Instruction *I) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -1081,7 +1082,7 @@
   if (!CostFactor.isValid())
     return InstructionCost::getMax();
 
-  InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
+  InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index, I);
   Cost *= CostFactor;
 
   if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
@@ -1146,7 +1147,6 @@
                                             unsigned AddressSpace,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
-
   InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
   if (!CostFactor.isValid())
     return InstructionCost::getMax();
@@ -1222,7 +1222,7 @@
   if (Src->isVectorTy() && Opcode == Instruction::Store)
     for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
          ++i)
-      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i, I);
 
   return Cost;
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -107,8 +107,8 @@
                                      CmpInst::Predicate VecPred,
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I);
   bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
                                   MaybeAlign Alignment, unsigned AddressSpace,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -997,7 +997,8 @@
 }
 
 InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                                   unsigned Index) {
+                                                   unsigned Index,
+                                                   const Instruction *I) {
   // vlvgp will insert two grs into a vector register, so only count half the
   // number of instructions.
   if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
@@ -1013,7 +1014,7 @@
     return Cost;
   }
 
-  return BaseT::getVectorInstrCost(Opcode, Val, Index);
+  return BaseT::getVectorInstrCost(Opcode, Val, Index, I);
 }
 
 // Check if a load may be folded as a memory operand in its user.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -67,8 +67,8 @@
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr);
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I);
 
   /// @}
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -84,9 +84,10 @@
 
 InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode,
                                                        Type *Val,
-                                                       unsigned Index) {
+                                                       unsigned Index,
+                                                       const Instruction *I) {
   InstructionCost Cost =
-      BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index);
+      BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index, I);
 
   // SIMD128's insert/extract currently only take constant indices.
   if (Index == -1u)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -146,8 +146,8 @@
                                      CmpInst::Predicate VecPred,
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     unsigned Index);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
+                                     const Instruction *I);
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
                                            bool Insert, bool Extract);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3644,7 +3644,8 @@
 }
 
 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                               unsigned Index) {
+                                               unsigned Index,
+                                               const Instruction *I) {
   static const CostTblEntry SLMCostTbl[] = {
      { ISD::EXTRACT_VECTOR_ELT,       MVT::i8,      4 },
      { ISD::EXTRACT_VECTOR_ELT,       MVT::i16,     4 },
@@ -3772,7 +3773,8 @@
   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
     RegisterFileMoveCost += 1;
 
-  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
+  return BaseT::getVectorInstrCost(Opcode, Val, Index, I) +
+         RegisterFileMoveCost;
 }
 
 InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
@@ -3901,7 +3903,8 @@
         for (unsigned I = 0; I != NumElts; ++I)
           if (WidenedDemandedElts[I]) {
             unsigned Idx = I % Scale;
-            Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx);
+            Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx,
+                                       nullptr);
           }
 
         return Cost;
@@ -4512,7 +4515,8 @@
   }
 
   // Add the final extract element to the cost.
-  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
+  return ReductionCost +
+         getVectorInstrCost(Instruction::ExtractElement, Ty, 0, nullptr);
 }
 
 InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
@@ -4813,7 +4817,8 @@
   }
 
   // Add the final extract element to the cost.
-  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
+  return MinMaxCost +
+         getVectorInstrCost(Instruction::ExtractElement, Ty, 0, nullptr);
 }
 
 /// Calculate the cost of materializing a 64-bit value. This helper
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6405,6 +6405,7 @@
   }
   StoreInst *SI = cast<StoreInst>(I);
 
+  // FIXME: Use a specific type rather than unknown.
   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
@@ -6412,7 +6413,7 @@
          (isLoopInvariantStoreValue
               ? 0
               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                       VF.getKnownMinValue() - 1));
+                                       VF.getKnownMinValue() - 1, nullptr));
 }
 
 InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5874,7 +5874,7 @@
         }
       }
       Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement,
-                                        EE->getVectorOperandType(), Idx);
+                                        EE->getVectorOperandType(), Idx, EE);
     }
     // Add a cost for subvector extracts/inserts if required.
     for (const auto &Data : ExtractVectorsTys) {
@@ -6109,10 +6109,11 @@
             auto *EE = cast<ExtractElementInst>(VL[I]);
             CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
                                                   EE->getVectorOperandType(),
-                                                  *getExtractIndex(EE));
+                                                  *getExtractIndex(EE), EE);
           } else {
-            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  VecTy, Idx);
+            CommonCost -=
+                TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx,
+                                        dyn_cast<Instruction>(VL[I]));
             ++Idx;
           }
         }
@@ -6122,11 +6123,12 @@
             auto *EE = cast<ExtractElementInst>(V);
             CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
                                                   EE->getVectorOperandType(),
-                                                  *getExtractIndex(EE));
+                                                  *getExtractIndex(EE), EE);
           } else {
             --Idx;
-            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  VecTy, Idx);
+            CommonCost +=
+                TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx,
+                                        dyn_cast<Instruction>(V));
           }
         }
       }
@@ -6150,8 +6152,8 @@
               continue;
             }
           }
-          CommonCost -=
-              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
+          CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                VecTy, I, EI);
         }
       } else {
         AdjustExtractsCost(CommonCost);
@@ -7142,8 +7144,8 @@
       ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                    VecTy, EU.Lane);
     } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+      ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                             EU.Lane, nullptr);
     }
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -271,9 +271,9 @@
   Type *VecTy = Ext0->getVectorOperand()->getType();
   assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
   InstructionCost Cost0 =
-      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0, Ext0);
   InstructionCost Cost1 =
-      TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+      TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1, Ext1);
 
   // If both costs are invalid no shuffle is needed
   if (!Cost0.isValid() && !Cost1.isValid())
@@ -337,10 +337,10 @@
   unsigned Ext0Index = Ext0IndexC->getZExtValue();
   unsigned Ext1Index = Ext1IndexC->getZExtValue();
 
-  InstructionCost Extract0Cost =
-      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
-  InstructionCost Extract1Cost =
-      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
+  InstructionCost Extract0Cost = TTI.getVectorInstrCost(
+      Instruction::ExtractElement, VecTy, Ext0Index, Ext0);
+  InstructionCost Extract1Cost = TTI.getVectorInstrCost(
+      Instruction::ExtractElement, VecTy, Ext1Index, Ext1);
 
   // A more expensive extract will always be replaced by a splat shuffle.
   // For example, if Ext0 is more expensive:
@@ -665,7 +665,7 @@
   // Get cost estimate for the insert element. This cost will factor into
   // both sequences.
   InstructionCost InsertCost =
-      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index, &I);
   InstructionCost OldCost =
       (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
   InstructionCost NewCost = ScalarOpCost + InsertCost +
@@ -755,8 +755,8 @@
     return false;
 
   InstructionCost OldCost =
-      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
-  OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0, Ext0);
+  OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1, Ext1);
   OldCost +=
       TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
                              CmpInst::makeCmpResultType(I0->getType()), Pred) *
@@ -776,7 +776,7 @@
   NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
                                 ShufMask);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
-  NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+  NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex, Ext0);
 
   // Aggressively form vector ops if the cost is equal because the transform
   // may enable further optimization.
@@ -1036,7 +1036,7 @@
     auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
     OriginalCost +=
         TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(),
-                               Index ? Index->getZExtValue() : -1);
+                               Index ? Index->getZExtValue() : -1, LI);
     ScalarizedCost +=
         TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
                             Align(1), LI->getPointerAddressSpace());
diff --git a/llvm/test/Analysis/CostModel/AArch64/kryo.ll b/llvm/test/Analysis/CostModel/AArch64/kryo.ll
--- a/llvm/test/Analysis/CostModel/AArch64/kryo.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/kryo.ll
@@ -21,26 +21,22 @@
     ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1
     %t3 = insertelement <2 x i64> undef, i64 undef, i32 0
     %t4 = insertelement <2 x i64> undef, i64 undef, i32 1
-
     ret void
 }
 
 ; CHECK-LABEL: vectorInstrExtractCost
 define i64 @vectorInstrExtractCost(<4 x i64> %vecreg) {
-    
-    ; Vector extracts - extracting each element at index 0 is considered
-    ; free in the current implementation. When extracting element at index
-    ; 2, 2 is rounded to 0, so extracting element at index 2 has cost 0 as 
-    ; well.
-    ;
     ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 1
-    ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 2
+    ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 2
     %t1 = extractelement <4 x i64> %vecreg, i32 1
     %t2 = extractelement <4 x i64> %vecreg, i32 2
     %ele = add i64 %t2, 1
     %cond = icmp eq i64 %t1, %ele
 
-    ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 0
+    ; Vector extracts - extracting each element should have a cost
+    ; if they are used as integers.
+    ;
+    ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 0
     ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 3
     %t0 = extractelement <4 x i64> %vecreg, i32 0
     %t3 = extractelement <4 x i64> %vecreg, i32 3
diff --git a/llvm/test/Transforms/LICM/AArch64/extract-element.ll b/llvm/test/Transforms/LICM/AArch64/extract-element.ll
--- a/llvm/test/Transforms/LICM/AArch64/extract-element.ll
+++ b/llvm/test/Transforms/LICM/AArch64/extract-element.ll
@@ -18,24 +18,23 @@
 ; CHECK-NEXT:    [[TMP12]] = add i64 [[TMP4]], 1
 ; CHECK-NEXT:    br label [[TMP3]]
 ; CHECK:       .split.loop.exit:
-; CHECK-NEXT:    [[DOTLCSSA7:%.*]] = phi <1 x i64> [ [[TMP8]], [[TMP6]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i64 [ [[TMP9]], [[TMP6]] ]
 ; CHECK-NEXT:    [[DOTLCSSA6:%.*]] = phi i64 [ [[TMP4]], [[TMP6]] ]
 ; CHECK-NEXT:    [[DOTPH:%.*]] = phi i1 [ [[TMP5]], [[TMP6]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[DOTLCSSA7]], i64 0
-; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[DOTLCSSA6]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp uge i64 [[TMP15]], [[TMP1]]
-; CHECK-NEXT:    br label [[TMP17:%.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[DOTLCSSA]], -1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], [[DOTLCSSA6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp uge i64 [[TMP14]], [[TMP1]]
+; CHECK-NEXT:    br label [[TMP16:%.*]]
 ; CHECK:       .split.loop.exit2:
 ; CHECK-NEXT:    [[DOTPH3:%.*]] = phi i1 [ [[TMP5]], [[TMP3]] ]
 ; CHECK-NEXT:    [[DOTPH4:%.*]] = phi i1 [ undef, [[TMP3]] ]
-; CHECK-NEXT:    br label [[TMP17]]
-; CHECK:       17:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i1 [ [[TMP16]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = xor i1 [[TMP18]], true
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP19]]
-; CHECK-NEXT:    ret i1 [[TMP21]]
+; CHECK-NEXT:    br label [[TMP16]]
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ [[TMP15]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP18]]
+; CHECK-NEXT:    ret i1 [[TMP20]]
 ;
   br label %3