diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1169,6 +1169,18 @@
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
+  /// with factor VF. Return the cost of the instruction, including
+  /// scalarization overhead if it's needed.
+  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
+
+  /// Estimate cost of a call instruction CI if it were vectorized with factor
+  /// VF. Return the cost of the instruction, including scalarization overhead
+  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
+  /// scalarized -
+  /// i.e. either vector version isn't available, or is too expensive.
+  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1221,6 +1233,10 @@
   /// element)
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
+  /// Estimate the overhead of scalarizing an instruction. This is a
+  /// convenience wrapper for the type-based getScalarizationOverhead API.
+  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1315,6 +1331,28 @@
 
   DecisionList WideningDecisions;
 
+  /// Returns true if \p V is expected to be vectorized and it needs to be
+  /// extracted.
+  bool needsExtract(Value *V, unsigned VF) const {
+    // Assume we can vectorize V (and hence we need extraction) if the
+    // scalars are not computed yet. This can happen, because it is called
+    // via getScalarizationOverhead from setCostBasedWideningDecision, before
+    // the scalars are collected.
+    if (VF > 1 && Scalars.find(VF) == Scalars.end())
+      return true;
+
+    Instruction *I = dyn_cast<Instruction>(V);
+    return I && TheLoop->contains(I) && !TheLoop->isLoopInvariant(I) &&
+           !isScalarAfterVectorization(I, VF);
+  };
+
+  /// Returns a range containing only operands needing to be extracted.
+  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+                                                   unsigned VF) {
+    return SmallVector<Value *, 4>(make_filter_range(
+        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+  }
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -3057,45 +3095,9 @@
   }
 }
 
-/// Estimate the overhead of scalarizing an instruction. This is a
-/// convenience wrapper for the type-based getScalarizationOverhead API.
-static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
-                                         const TargetTransformInfo &TTI) {
-  if (VF == 1)
-    return 0;
-
-  unsigned Cost = 0;
-  Type *RetTy = ToVectorTy(I->getType(), VF);
-  if (!RetTy->isVoidTy() &&
-      (!isa<LoadInst>(I) ||
-       !TTI.supportsEfficientVectorElementLoadStore()))
-    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
-
-  // Some targets keep addresses scalar.
-  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
-    return Cost;
-
-  if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    SmallVector<const Value *, 4> Operands(CI->arg_operands());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-  else if (!isa<StoreInst>(I) ||
-           !TTI.supportsEfficientVectorElementLoadStore()) {
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-
-  return Cost;
-}
-
-// Estimate cost of a call instruction CI if it were vectorized with factor VF.
-// Return the cost of the instruction, including scalarization overhead if it's
-// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
-// i.e. either vector version isn't available, or is too expensive.
-static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
-                                  const TargetTransformInfo &TTI,
-                                  const TargetLibraryInfo *TLI,
-                                  bool &NeedToScalarize) {
+unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+                                                       unsigned VF,
+                                                       bool &NeedToScalarize) {
   Function *F = CI->getCalledFunction();
   StringRef FnName = CI->getCalledFunction()->getName();
   Type *ScalarRetTy = CI->getType();
@@ -3118,7 +3120,7 @@
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
 
   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
 
@@ -3137,12 +3139,8 @@
   return Cost;
 }
 
-// Estimate cost of an intrinsic call instruction CI if it were vectorized with
-// factor VF. Return the cost of the instruction, including scalarization
-// overhead if it's needed.
-static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
-                                       const TargetTransformInfo &TTI,
-                                       const TargetLibraryInfo *TLI) {
+unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
+                                                            unsigned VF) {
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
 
@@ -3150,8 +3148,11 @@
   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
     FMF = FPMO->getFastMathFlags();
 
-  SmallVector<Value *, 4> Operands(CI->arg_operands());
-  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
+  // Skip operands that do not require extraction/scalarization and do not incur
+  // any overhead.
+  return TTI.getIntrinsicInstrCost(
+      ID, CI->getType(), filterExtractingOperands(CI->arg_operands(), VF), FMF,
+      VF);
 }
 
 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -4126,9 +4127,9 @@
   // version of the instruction.
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool NeedToScalarize;
-  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+  unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
   bool UseVectorIntrinsic =
-      ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+      ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
   assert((UseVectorIntrinsic || !NeedToScalarize) &&
          "Instruction should be scalarized elsewhere.");
 
@@ -5352,15 +5353,6 @@
     return true;
   };
 
-  // Returns true if an operand that cannot be scalarized must be extracted
-  // from a vector. We will account for this scalarization overhead below. Note
-  // that the non-void predicated instructions are placed in their own blocks,
-  // and their return values are inserted into vectors. Thus, an extract would
-  // still be required.
-  auto needsExtract = [&](Instruction *I) -> bool {
-    return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
-  };
-
   // Compute the expected cost discount from scalarizing the entire expression
   // feeding the predicated instruction. We currently only consider expressions
   // that are single-use instruction chains.
@@ -5400,7 +5392,7 @@
                "Instruction has non-scalar type");
         if (canBeScalarized(J))
           Worklist.push_back(J);
-        else if (needsExtract(J))
+        else if (needsExtract(J, VF))
           ScalarCost += TTI.getScalarizationOverhead(
                               ToVectorTy(J->getType(),VF), false, true);
       }
@@ -5522,7 +5514,7 @@
 
     // Get the overhead of the extractelement and insertelement instructions
     // we might create due to scalarization.
-    Cost += getScalarizationOverhead(I, VF, TTI);
+    Cost += getScalarizationOverhead(I, VF);
 
     // If we have a predicated store, it may not be executed for each vector
     // lane. Scale the cost by the probability of executing the predicated
@@ -5674,6 +5666,36 @@
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
+unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+                                                              unsigned VF) {
+
+  if (VF == 1)
+    return 0;
+
+  unsigned Cost = 0;
+  Type *RetTy = ToVectorTy(I->getType(), VF);
+  if (!RetTy->isVoidTy() &&
+      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
+    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+
+  // Some targets keep addresses scalar.
+  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+    return Cost;
+
+  // Some targets support efficient element stores.
+  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
+    return Cost;
+
+  // Collect operands to consider.
+  CallInst *CI = dyn_cast<CallInst>(I);
+  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+
+  // Skip operands that do not require extraction/scalarization and do not incur
+  // any overhead.
+  return Cost + TTI.getOperandsScalarizationOverhead(
+                    filterExtractingOperands(Ops, VF), VF);
+}
+
 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
   if (VF == 1)
     return;
@@ -5914,7 +5936,7 @@
 
       // The cost of insertelement and extractelement instructions needed for
       // scalarization.
-      Cost += getScalarizationOverhead(I, VF, TTI);
+      Cost += getScalarizationOverhead(I, VF);
 
       // Scale the cost by the probability of executing the predicated blocks.
       // This assumes the predicated block for each vector lane is equally
@@ -6035,16 +6057,16 @@
   case Instruction::Call: {
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
-    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
     if (getVectorIntrinsicIDForCall(CI, TLI))
-      return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
     return CallCost;
   }
   default:
     // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
-           getScalarizationOverhead(I, VF, TTI);
+           getScalarizationOverhead(I, VF);
   } // end of switch.
 }
 
@@ -6638,9 +6660,9 @@
     // version of the instruction.
     // Is it beneficial to perform intrinsic call compared to lib call?
     bool NeedToScalarize;
-    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
     bool UseVectorIntrinsic =
-        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
     return UseVectorIntrinsic || !NeedToScalarize;
   }
   if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
@@ -6828,7 +6850,7 @@
   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
   auto Plan = llvm::make_unique<VPlan>(VPBB);
 
-  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
+  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
   // Represent values that will have defs inside VPlan.
   for (Value *V : NeedDef)
     Plan->addVPValue(V);
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -29,9 +29,6 @@
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 
-  /// Target Transform Info.
-  const TargetTransformInfo *TTI;
-
   /// The legality analysis.
   LoopVectorizationLegality *Legal;
 
@@ -104,11 +101,9 @@
 
 public:
   VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
-                  const TargetTransformInfo *TTI,
                   LoopVectorizationLegality *Legal,
                   LoopVectorizationCostModel &CM, VPBuilder &Builder)
-      : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
-        Builder(Builder) {}
+      : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {}
 
   /// Check if a recipe can be create for \p I withing the given VF \p Range.
   /// If a recipe can be created, it adds it to \p VPBB.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -0,0 +1,109 @@
+; REQUIRES: asserts
+
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 %s -S | FileCheck --check-prefix=FORCED %s
+
+; Test case from PR41294.
+
+; Check scalar cost for extractvalue. The constant and loop invariant operands are free,
+; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
+
+; CM: LV: Scalar loop costs: 7.
+; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1
+
+; Check that the extractvalue operands are actually free in vector code.
+
+; FORCED-LABEL: vector.body:                                      ; preds = %vector.body, %vector.ph
+; FORCED-NEXT:    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT:    %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
+; FORCED-NEXT:    %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
+; FORCED-NEXT:    %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
+; FORCED-NEXT:    %0 = add i32 %index, 0
+; FORCED-NEXT:    %1 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT:    %2 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT:    %3 = insertelement <2 x i64> undef, i64 %1, i32 0
+; FORCED-NEXT:    %4 = insertelement <2 x i64> %3, i64 %2, i32 1
+; FORCED-NEXT:    %5 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT:    %6 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT:    %7 = insertelement <2 x i64> undef, i64 %5, i32 0
+; FORCED-NEXT:    %8 = insertelement <2 x i64> %7, i64 %6, i32 1
+; FORCED-NEXT:    %9 = getelementptr i64, i64* %dst, i32 %0
+; FORCED-NEXT:    %10 = add <2 x i64> %4, %8
+; FORCED-NEXT:    %11 = getelementptr i64, i64* %9, i32 0
+; FORCED-NEXT:    %12 = bitcast i64* %11 to <2 x i64>*
+; FORCED-NEXT:    store <2 x i64> %10, <2 x i64>* %12, align 4
+; FORCED-NEXT:    %index.next = add i32 %index, 2
+; FORCED-NEXT:    %13 = icmp eq i32 %index.next, 0
+; FORCED-NEXT:    br i1 %13, label %middle.block, label %vector.body, !llvm.loop !0
+
+define void @test1(i64* %dst, {i64, i64} %sv) {
+entry:
+  br label %loop.body
+
+loop.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+  %a = extractvalue { i64, i64 } %sv, 0
+  %b = extractvalue { i64, i64 } %sv, 1
+  %addr = getelementptr i64, i64* %dst, i32 %iv
+  %add = add i64 %a, %b
+  store i64 %add, i64* %addr
+  %iv.next = add nsw i32 %iv, 1
+  %cond = icmp ne i32 %iv.next, 0
+  br i1 %cond, label %loop.body, label %exit
+
+exit:
+  ret void
+}
+
+
+; Similar to the test case above, but checks getVectorCallCost as well.
+declare float @powf(float, float) readnone nounwind
+
+; CM: LV: Scalar loop costs: 16.
+; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1
+
+; FORCED-LABEL: define void @test_getVectorCallCost
+
+; FORCED-LABEL: vector.body:                                      ; preds = %vector.body, %vector.ph
+; FORCED-NEXT:    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT:    %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
+; FORCED-NEXT:    %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
+; FORCED-NEXT:    %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
+; FORCED-NEXT:    %0 = add i32 %index, 0
+; FORCED-NEXT:    %1 = extractvalue { float, float } %sv, 0
+; FORCED-NEXT:    %2 = extractvalue { float, float } %sv, 0
+; FORCED-NEXT:    %3 = insertelement <2 x float> undef, float %1, i32 0
+; FORCED-NEXT:    %4 = insertelement <2 x float> %3, float %2, i32 1
+; FORCED-NEXT:    %5 = extractvalue { float, float } %sv, 1
+; FORCED-NEXT:    %6 = extractvalue { float, float } %sv, 1
+; FORCED-NEXT:    %7 = insertelement <2 x float> undef, float %5, i32 0
+; FORCED-NEXT:    %8 = insertelement <2 x float> %7, float %6, i32 1
+; FORCED-NEXT:    %9 = getelementptr float, float* %dst, i32 %0
+; FORCED-NEXT:    %10 = call <2 x float> @llvm.pow.v2f32(<2 x float> %4, <2 x float> %8)
+; FORCED-NEXT:    %11 = getelementptr float, float* %9, i32 0
+; FORCED-NEXT:    %12 = bitcast float* %11 to <2 x float>*
+; FORCED-NEXT:    store <2 x float> %10, <2 x float>* %12, align 4
+; FORCED-NEXT:    %index.next = add i32 %index, 2
+; FORCED-NEXT:    %13 = icmp eq i32 %index.next, 0
+; FORCED-NEXT:    br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4
+
+define void @test_getVectorCallCost(float* %dst, {float, float} %sv) {
+entry:
+  br label %loop.body
+
+loop.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+  %a = extractvalue { float, float } %sv, 0
+  %b = extractvalue { float, float } %sv, 1
+  %addr = getelementptr float, float* %dst, i32 %iv
+  %p = call float @powf(float %a, float %b)
+  store float %p, float* %addr
+  %iv.next = add nsw i32 %iv, 1
+  %cond = icmp ne i32 %iv.next, 0
+  br i1 %cond, label %loop.body, label %exit
+
+exit:
+  ret void
+}
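Note on the idiom used by the patch: filterExtractingOperands builds a SmallVector directly from make_filter_range, so only operands for which needsExtract returns true reach the TTI overhead queries; loop-invariant values such as the extractvalue results in the tests above no longer inflate the scalarization cost. The standalone sketch below illustrates just that construction. It is illustration only, not part of the patch; the integer values and the evenness predicate are made-up stand-ins for operands and needsExtract.

// Sketch: constructing a SmallVector from a filtered range, as
// filterExtractingOperands does. Requires only LLVM's ADT headers.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

int main() {
  llvm::SmallVector<int, 8> Ops{1, 2, 3, 4, 5, 6};
  // The predicate plays the role of needsExtract: keep only "operands" that
  // would really need an extractelement (here: the even stand-ins).
  llvm::SmallVector<int, 8> NeedExtract(
      llvm::make_filter_range(Ops, [](int V) { return V % 2 == 0; }));
  for (int V : NeedExtract)
    std::printf("%d\n", V); // prints 2, 4, 6
  return 0;
}

Because only the filtered operands are handed to TTI.getOperandsScalarizationOverhead and TTI.getIntrinsicInstrCost, the CM lines in the tests report no extraction cost for the loop-invariant struct operands.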