diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6960,40 +6960,10 @@
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
+  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
   auto SE = PSE.getSE();
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

-  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
-                                                ElementCount VF) -> bool {
-    if (VF.isScalar())
-      return true;
-
-    auto Scalarized = InstsToScalarize.find(VF);
-    assert(Scalarized != InstsToScalarize.end() &&
-           "VF not yet analyzed for scalarization profitability");
-    return !Scalarized->second.count(I) &&
-           llvm::all_of(I->users(), [&](User *U) {
-             auto *UI = cast<Instruction>(U);
-             return !Scalarized->second.count(UI);
-           });
-  };
-  (void) hasSingleCopyAfterVectorization;
-
-  if (isScalarAfterVectorization(I, VF)) {
-    // With the exception of GEPs and PHIs, after scalarization there should
-    // only be one copy of the instruction generated in the loop. This is
-    // because the VF is either 1, or any instructions that need scalarizing
-    // have already been dealt with by the time we get here. As a result,
-    // it means we don't have to multiply the instruction cost by VF.
-    assert(I->getOpcode() == Instruction::GetElementPtr ||
-           I->getOpcode() == Instruction::PHI ||
-           (I->getOpcode() == Instruction::BitCast &&
-            I->getType()->isPointerTy()) ||
-           hasSingleCopyAfterVectorization(I, VF));
-    VectorTy = RetTy;
-  } else
-    VectorTy = ToVectorTy(RetTy, VF);
-
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
@@ -7120,15 +7090,20 @@
         Op2VK = TargetTransformInfo::OK_UniformValue;

     SmallVector<const Value *, 4> Operands(I->operand_values());
-    return TTI.getArithmeticInstrCost(
-        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
-        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
+    return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind,
+                                          TargetTransformInfo::OK_AnyValue,
+                                          Op2VK, TargetTransformInfo::OP_None,
+                                          Op2VP, Operands, I);
   }
   case Instruction::FNeg: {
-    return TTI.getArithmeticInstrCost(
-        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
-        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
-        TargetTransformInfo::OP_None, I->getOperand(0), I);
+    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
+    return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind,
+                                          TargetTransformInfo::OK_AnyValue,
+                                          TargetTransformInfo::OK_AnyValue,
+                                          TargetTransformInfo::OP_None,
+                                          TargetTransformInfo::OP_None,
+                                          I->getOperand(0), I);
   }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
@@ -7188,10 +7163,6 @@
     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
     return getMemoryInstructionCost(I, VF);
   }
-  case Instruction::BitCast:
-    if (I->getType()->isPointerTy())
-      return 0;
-    LLVM_FALLTHROUGH;
   case Instruction::ZExt:
   case Instruction::SExt:
   case Instruction::FPToUI:
@@ -7202,7 +7173,8 @@
   case Instruction::SIToFP:
   case Instruction::UIToFP:
   case Instruction::Trunc:
-  case Instruction::FPTrunc: {
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
     // Computes the CastContextHint from a Load/Store instruction.
     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -7278,7 +7250,14 @@
       }
     }

-    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
+    unsigned N;
+    if (isScalarAfterVectorization(I, VF)) {
+      assert(!VF.isScalable() && "VF is assumed to be non scalable");
+      N = VF.getKnownMinValue();
+    } else
+      N = 1;
+    return N *
+           TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
@@ -7302,8 +7281,11 @@
       return InstructionCost::getInvalid();
     LLVM_FALLTHROUGH;
   default:
-    // This opcode is unknown. Assume that it is the same as 'mul'.
-    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+    // The cost of executing VF copies of the scalar instruction. This opcode
+    // is unknown. Assume that it is the same as 'mul'.
+    return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
+                                       Instruction::Mul, VectorTy, CostKind) +
+           getScalarizationOverhead(I, VF);
   } // end of switch.
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -6,7 +6,7 @@
 ; CHECK-LABEL: all_scalar
 ; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
 ; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
 ;
 define void @all_scalar(i64* %a, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
@@ -1,13 +1,44 @@
 ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
-; REQUIRES: asserts
-; XFAIL: *
-
 target triple = "x86_64-apple-macosx"

-; CHECK: vector.body
-
 define void @test_pr55096(i64 %c, ptr %p) {
+; CHECK-LABEL: @test_pr55096(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %pred.store.continue3 ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %pred.store.continue3 ]
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT: [[TMP1:%.*]] = mul i16 [[TMP0]], 2008
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 6229, [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <2 x i64> [[VEC_IND]], {{.*}}
+; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP2]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP4]], label %pred.store.if, label %pred.store.continue
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = add i16 [[TMP5]], 2008
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i16 4943, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i16 [[TMP7]]
+; CHECK-NEXT: store i16 0, ptr [[TMP8]], align 2
+; CHECK-NEXT: br label %pred.store.continue
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP9:%.*]] = phi i16 [ poison, %vector.body ], [ [[TMP7]], %pred.store.if ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label %pred.store.if2, label %pred.store.continue3
+; CHECK: pred.store.if2:
+; CHECK-NEXT: [[TMP11:%.*]] = add i16 [[OFFSET_IDX]], 2008
+; CHECK-NEXT: [[TMP12:%.*]] = add i16 [[TMP11]], 2008
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i16 4943, [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP13]]
+; CHECK-NEXT: store i16 0, ptr [[TMP14]], align 2
+; CHECK-NEXT: br label %pred.store.continue3
+; CHECK: pred.store.continue3:
+; CHECK-NEXT: [[TMP15:%.*]] = phi i16 [ poison, %pred.store.continue ], [ [[TMP13]], %pred.store.if2 ]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 340
+; CHECK-NEXT: br i1 [[TMP16]], label %middle.block, label %vector.body
+;
 entry:
   br label %loop.header