diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -40,6 +40,7 @@
 
 #include "X86TargetTransformInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -3810,61 +3811,102 @@
   auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
   auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
 
-  int NumDstElements = VF * ReplicationFactor;
-  auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
-  auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
+  // What is our naive guess at how many elements the replication will produce?
+  // Some vectors may end up being duplicate or not demanded,
+  // so  we may end up producing less vectors (and thus, elements) than that.
+  int NaiveNumDstElements = VF * ReplicationFactor;
+  auto *PromNaiveDstVecTy =
+      FixedVectorType::get(PromEltTy, NaiveNumDstElements);
 
   // Legalize the types.
   MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second;
   MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second;
-  MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second;
-  MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second;
+  MVT LegalPromNaiveDstVecTy =
+      TLI->getTypeLegalizationCost(DL, PromNaiveDstVecTy).second;
   // They should have legalized into vector types.
   if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
-      !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
+      !LegalPromNaiveDstVecTy.isVector())
     return bailout();
 
+  auto *SinglePromNaiveDstVecTy =
+      FixedVectorType::get(PromNaiveDstVecTy->getElementType(),
+                           LegalPromNaiveDstVecTy.getVectorNumElements());
+
+  ArrayRef<int> PrevSubMask;
+  ArrayRef<int> CurrSubMask;
+
+  auto IsNewUniqueSubMask = [&PrevSubMask, &CurrSubMask]() {
+    assert(!CurrSubMask.empty() && "The current submask is never empty.");
+    // Is this the first submask we are looking at?
+    if (PrevSubMask.empty())
+      return true;
+    assert(CurrSubMask.size() <= PrevSubMask.size() &&
+           "The mask size will at most decrease (once, for the final mask).");
+    return CurrSubMask != PrevSubMask.take_front(CurrSubMask.size());
+  };
+
+  const auto ReplicatedMask = createReplicatedMask(ReplicationFactor, VF);
+  ArrayRef<int> RemainingMask = ReplicatedMask;
+
+  auto ShouldCurrSubMaskBeDemanded = [DemandedDstElts, &RemainingMask,
+                                      &CurrSubMask,
+                                      ReplicatedMaskSize =
+                                          ReplicatedMask.size()]() {
+    size_t ProcessedMaskSize = ReplicatedMaskSize - RemainingMask.size();
+    return !DemandedDstElts.extractBits(CurrSubMask.size(), ProcessedMaskSize)
+                .isZero();
+  };
+
+  unsigned NumDemandedUniqueSubMasks = 0;
+  bool IsCurrUniqueSubMaskDemanded = false;
+  int NumDstElements = 0;
+
+  while (!RemainingMask.empty()) {
+    size_t CurrMaskSize = std::min<size_t>(
+        SinglePromNaiveDstVecTy->getNumElements(), RemainingMask.size());
+    assert(CurrMaskSize != 0 && "The current submask is never empty.");
+    CurrSubMask = RemainingMask.take_front(CurrMaskSize);
+
+    if (IsNewUniqueSubMask()) {
+      IsCurrUniqueSubMaskDemanded = false;
+      PrevSubMask = CurrSubMask;
+    }
+
+    if (!IsCurrUniqueSubMaskDemanded) {
+      IsCurrUniqueSubMaskDemanded = ShouldCurrSubMaskBeDemanded();
+      if (IsCurrUniqueSubMaskDemanded) {
+        ++NumDemandedUniqueSubMasks;
+        NumDstElements += CurrMaskSize;
+      }
+    }
+
+    RemainingMask = RemainingMask.drop_front(CurrSubMask.size());
+  }
+
+  auto *PromRealDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
+  auto *RealDstVecTy = FixedVectorType::get(EltTy, NumDstElements);
+
+  InstructionCost CastCost;
+
   if (PromEltTyBits != EltTyBits) {
     // If we have to perform the shuffle with wider elt type than our data type,
     // then we will first need to anyext (we don't care about the new bits)
     // the source elements, and then truncate Dst elements.
-    InstructionCost PromotionCost;
-    PromotionCost += getCastInstrCost(
+    CastCost += getCastInstrCost(
         Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
         TargetTransformInfo::CastContextHint::None, CostKind);
-    PromotionCost +=
-        getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
-                         /*Src=*/PromDstVecTy,
+    CastCost +=
+        getCastInstrCost(Instruction::Trunc, /*Dst=*/RealDstVecTy,
+                         /*Src=*/PromRealDstVecTy,
                          TargetTransformInfo::CastContextHint::None, CostKind);
-    return PromotionCost + getReplicationShuffleCost(PromEltTy,
-                                                     ReplicationFactor, VF,
-                                                     DemandedDstElts, CostKind);
   }
 
-  assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
-         LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
-         "We expect that the legalization doesn't affect the element width, "
-         "doesn't coalesce/split elements.");
-
-  unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
-  unsigned NumDstVectors =
-      divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
-
-  auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
-
-  // Not all the produced Dst elements may be demanded. In our case,
-  // given that a single Dst vector is formed by a single shuffle,
-  // if all elements that will form a single Dst vector aren't demanded,
-  // then we won't need to do that shuffle, so adjust the cost accordingly.
-  APInt DemandedDstVectors = APIntOps::ScaleBitMask(
-      DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec),
-      NumDstVectors);
-  unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();
-
   InstructionCost SingleShuffleCost =
-      getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy,
+      getShuffleCost(TTI::SK_PermuteSingleSrc, SinglePromNaiveDstVecTy,
                      /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr);
-  return NumDstVectorsDemanded * SingleShuffleCost;
+  InstructionCost ShuffleCost = NumDemandedUniqueSubMasks * SingleShuffleCost;
+
+  return CastCost + ShuffleCost;
 }
 
 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll
@@ -380,7 +380,7 @@
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i64_stride7'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
@@ -444,10 +444,10 @@
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i64_stride8'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>