diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -103,7 +103,7 @@
 #include "AArch64GenSubtargetInfo.inc"
 
   uint8_t MaxInterleaveFactor = 2;
-  uint8_t VectorInsertExtractBaseCost = 3;
+  uint8_t VectorInsertExtractBaseCost = 2;
   uint16_t CacheLineSize = 0;
   uint16_t PrefetchDistance = 0;
   uint16_t MinPrefetchStride = 1;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -385,6 +385,11 @@
                                  VectorType *SubTp,
                                  ArrayRef<const Value *> Args = std::nullopt);
 
+  InstructionCost getScalarizationOverhead(VectorType *Ty,
+                                           const APInt &DemandedElts,
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind);
+
   /// Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2560,6 +2560,18 @@
   return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
 }
 
+InstructionCost AArch64TTIImpl::getScalarizationOverhead(
+    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind) {
+  if (isa<ScalableVectorType>(Ty))
+    return InstructionCost::getInvalid();
+  if (Ty->getElementType()->isFloatingPointTy())
+    return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                           CostKind);
+  return DemandedElts.popcount() * (Insert + Extract) *
+         ST->getVectorInsertExtractBaseCost();
+}
+
 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
@@ -198,16 +198,16 @@
 define i32 @frem(i32 %arg) {
 ; CHECK-LABEL: 'frem'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4F16 = frem <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8F16 = frem <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16F16 = frem <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F16 = frem <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8F16 = frem <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V16F16 = frem <16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = frem <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4F32 = frem <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8F32 = frem <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = frem <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = frem <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = frem <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = frem <4 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = frem half undef, undef
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll b/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
--- a/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
@@ -355,9 +355,9 @@
 define i32 @smul(i32 %arg) {
 ; RECIP-LABEL: 'smul'
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@@ -437,9 +437,9 @@
 define i32 @umul(i32 %arg) {
 ; RECIP-LABEL: 'umul'
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/bswap.ll b/llvm/test/Analysis/CostModel/AArch64/bswap.ll
--- a/llvm/test/Analysis/CostModel/AArch64/bswap.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/bswap.ll
@@ -44,7 +44,7 @@
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = call <3 x i32> @llvm.bswap.v3i32(<3 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4i48 = call <4 x i48> @llvm.bswap.v4i48(<4 x i48> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4i48 = call <4 x i48> @llvm.bswap.v4i48(<4 x i48> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v4i16 = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -947,8 +947,8 @@
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
@@ -967,8 +967,8 @@
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
@@ -987,8 +987,8 @@
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
@@ -1373,8 +1373,8 @@
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
@@ -1586,8 +1586,8 @@
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
@@ -3214,9 +3214,9 @@
 
 define void @extend_extract() {
 ; CHECK-LABEL: 'extend_extract'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e8 = extractelement <8 x i8> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e16 = extractelement <8 x i16> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e32 = extractelement <8 x i32> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e8 = extractelement <8 x i8> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e16 = extractelement <8 x i16> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e32 = extractelement <8 x i32> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8_16 = sext i8 %e8 to i16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %z8_16 = zext i8 %e8 to i16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8_32 = sext i8 %e8 to i32
@@ -3233,9 +3233,9 @@
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SVE-LABEL: 'extend_extract'
-; SVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e8 = extractelement <8 x i8> undef, i32 1
-; SVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e16 = extractelement <8 x i16> undef, i32 1
-; SVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e32 = extractelement <8 x i32> undef, i32 1
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e8 = extractelement <8 x i8> undef, i32 1
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e16 = extractelement <8 x i16> undef, i32 1
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e32 = extractelement <8 x i32> undef, i32 1
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8_16 = sext i8 %e8 to i16
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %z8_16 = zext i8 %e8 to i16
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8_32 = sext i8 %e8 to i32
@@ -3292,38 +3292,38 @@
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x half> undef to <4 x i8>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x half> undef to <4 x i16>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 x i16>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r250 = uitofp <8 x i1> undef to <8 x half>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half>
@@ -3438,8 +3438,8 @@
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
@@ -3448,28 +3448,28 @@
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r250 = uitofp <8 x i1> undef to <8 x half>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half>
@@ -3511,8 +3511,8 @@
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
@@ -3523,8 +3523,8 @@
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>
@@ -3584,8 +3584,8 @@
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
@@ -3596,8 +3596,8 @@
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>
diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -17,7 +17,7 @@
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf16 = fcmp oge half undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf32 = fcmp ogt float undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf64 = fcmp ogt double undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %cfv816 = fcmp olt <8 x half> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cfv816 = fcmp olt <8 x half> undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cfv432 = fcmp oge <4 x float> undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cfv264 = fcmp oge <2 x double> undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/AArch64/ctlz.ll b/llvm/test/Analysis/CostModel/AArch64/ctlz.ll
--- a/llvm/test/Analysis/CostModel/AArch64/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/ctlz.ll
@@ -55,7 +55,7 @@
 define <2 x i64> @test_ctlz_v2i64(<2 x i64> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
   %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
@@ -154,7 +154,7 @@
 
 define <4 x i64> @test_ctlz_v4i64(<4 x i64> %a) {
 ; CHECK-LABEL: 'test_ctlz_v4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
   %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz.ll b/llvm/test/Analysis/CostModel/AArch64/cttz.ll
--- a/llvm/test/Analysis/CostModel/AArch64/cttz.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz.ll
@@ -55,7 +55,7 @@
 define <2 x i64> @test_cttz_v2i64(<2 x i64> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
 ;
   %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
@@ -65,7 +65,7 @@
 define <2 x i32> @test_cttz_v2i32(<2 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %cttz
 ;
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
@@ -75,7 +75,7 @@
 define <4 x i32> @test_cttz_v4i32(<4 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
 ;
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
@@ -85,7 +85,7 @@
 define <2 x i16> @test_cttz_v2i16(<2 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %cttz
 ;
   %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
@@ -95,7 +95,7 @@
 define <4 x i16> @test_cttz_v4i16(<4 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %cttz
 ;
   %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
@@ -105,7 +105,7 @@
 define <8 x i16> @test_cttz_v8i16(<8 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v8i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
 ;
   %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
@@ -115,7 +115,7 @@
 define <2 x i8> @test_cttz_v2i8(<2 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %cttz
 ;
   %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
@@ -125,7 +125,7 @@
 define <4 x i8> @test_cttz_v4i8(<4 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %cttz
 ;
   %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
@@ -135,7 +135,7 @@
 define <8 x i8> @test_cttz_v8i8(<8 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v8i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %cttz
 ;
   %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
@@ -145,7 +145,7 @@
 define <16 x i8> @test_cttz_v16i8(<16 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v16i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
 ;
   %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
@@ -155,7 +155,7 @@
 define <4 x i64> @test_cttz_v4i64(<4 x i64> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz
 ;
   %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
@@ -165,7 +165,7 @@
 define <8 x i32> @test_cttz_v8i32(<8 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v8i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz
 ;
   %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
@@ -175,7 +175,7 @@
 define <16 x i16> @test_cttz_v16i16(<16 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v16i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz
 ;
   %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
@@ -185,7 +185,7 @@
 define <32 x i8> @test_cttz_v32i8(<32 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v32i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz
 ;
   %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll
--- a/llvm/test/Analysis/CostModel/AArch64/div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/div.ll
@@ -6,21 +6,21 @@
 define i32 @sdiv() {
 ; CHECK-LABEL: 'sdiv'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, undef
@@ -49,21 +49,21 @@
 define i32 @udiv() {
 ; CHECK-LABEL: 'udiv'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, undef
@@ -92,21 +92,21 @@
 define i32 @sdiv_const() {
 ; CHECK-LABEL: 'sdiv_const'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 7
@@ -135,21 +135,21 @@
 define i32 @udiv_const() {
 ; CHECK-LABEL: 'udiv_const'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, 7
@@ -264,21 +264,21 @@
 define i32 @sdiv_constpow2() {
 ; CHECK-LABEL: 'sdiv_constpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 16
@@ -307,21 +307,21 @@
 define i32 @udiv_constpow2() {
 ; CHECK-LABEL: 'udiv_constpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, 16
@@ -350,21 +350,21 @@
 define i32 @sdiv_uniformconstpow2() {
 ; CHECK-LABEL: 'sdiv_uniformconstpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 244 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 250 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 500 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 16
@@ -436,21 +436,21 @@
 define i32 @sdiv_constnegpow2() {
 ; CHECK-LABEL: 'sdiv_constnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, -16
@@ -479,21 +479,21 @@
 define i32 @udiv_constnegpow2() {
 ; CHECK-LABEL: 'udiv_constnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, -16
diff --git a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
--- a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
@@ -34,7 +34,7 @@
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
@@ -233,46 +233,46 @@
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 206 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 250 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 366 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 177 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 547 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 413 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 504 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 385 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 286 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 374 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 732 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 354 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 149 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 147 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 171 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 325 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 373 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 281 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 373 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 281 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 342 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 650 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 286 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-FP16-LABEL: 'fp16'
@@ -292,20 +292,20 @@
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
@@ -314,8 +314,8 @@
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 206 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
@@ -324,8 +324,8 @@
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 400 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll b/llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll
--- a/llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll
@@ -616,7 +616,7 @@
 }
 
 ; COST-LABEL: neg_llegal_vector_type_3
-; COST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68>
+; COST-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68>
 define <3 x i68> @neg_llegal_vector_type_3(<3 x i34> %a, <3 x i68> %b) {
   %tmp0 = zext <3 x i34> %a to <3 x i68>
   %tmp1 = add <3 x i68> %b, %tmp0
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
--- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
@@ -116,7 +116,7 @@
 
 define <16 x i8> @fshl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
 ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl
 ;
 entry:
@@ -148,7 +148,7 @@
 
 define <8 x i16> @fshl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
 ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshl
 ;
 entry:
@@ -180,7 +180,7 @@
 
 define <4 x i32> @fshl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl
 ;
 entry:
@@ -212,7 +212,7 @@
 
 define <2 x i64> @fshl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl
 ;
 entry:
@@ -224,7 +224,7 @@
 
 define <4 x i30> @fshl_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) {
 ; CHECK-LABEL: 'fshl_v4i30_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i30> %fshl
 ;
 entry:
@@ -236,7 +236,7 @@
 
 define <2 x i66> @fshl_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) {
 ; CHECK-LABEL: 'fshl_v2i66_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i66> %fshl
 ;
 entry:
@@ -259,7 +259,7 @@
 
 define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) {
 ; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %fshl
 ;
 entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshr.ll b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
--- a/llvm/test/Analysis/CostModel/AArch64/fshr.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
@@ -116,7 +116,7 @@
 
 define <16 x i8> @fshr_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
 ; CHECK-LABEL: 'fshr_v16i8_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshr
 ;
 entry:
@@ -148,7 +148,7 @@
 
 define <8 x i16> @fshr_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
 ; CHECK-LABEL: 'fshr_v8i16_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshr
 ;
 entry:
@@ -180,7 +180,7 @@
 
 define <4 x i32> @fshr_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: 'fshr_v4i32_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshr
 ;
 entry:
@@ -212,7 +212,7 @@
 
 define <2 x i64> @fshr_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: 'fshr_v2i64_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshr
 ;
 entry:
@@ -224,7 +224,7 @@
 
 define <4 x i30> @fshr_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) {
 ; CHECK-LABEL: 'fshr_v4i30_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i30> %fshr
 ;
 entry:
@@ -236,7 +236,7 @@
 
 define <2 x i66> @fshr_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) {
 ; CHECK-LABEL: 'fshr_v2i66_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i66> %fshr
 ;
 entry:
@@ -259,7 +259,7 @@
 
 define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) {
 ; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %fshr
 ;
 entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
--- a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
@@ -21,8 +21,8 @@
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %15 = call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %16 = call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %15 = call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %16 = call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
diff --git a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
--- a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
@@ -11,38 +11,38 @@
 
 define void @vectorInstrCost() {
 ; CHECK-LABEL: 'vectorInstrCost'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ta0 = extractelement <8 x i1> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ta1 = extractelement <8 x i1> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t1 = extractelement <8 x i8> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t2 = extractelement <8 x i8> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t3 = extractelement <4 x i16> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t4 = extractelement <4 x i16> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t5 = extractelement <2 x i32> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t6 = extractelement <2 x i32> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t7 = extractelement <2 x i64> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t8 = extractelement <2 x i64> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ta0 = extractelement <8 x i1> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ta1 = extractelement <8 x i1> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t1 = extractelement <8 x i8> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t2 = extractelement <8 x i8> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t3 = extractelement <4 x i16> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t4 = extractelement <4 x i16> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t5 = extractelement <2 x i32> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t6 = extractelement <2 x i32> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t7 = extractelement <2 x i64> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t8 = extractelement <2 x i64> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t9 = extractelement <4 x half> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t10 = extractelement <4 x half> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = extractelement <4 x half> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t11 = extractelement <2 x float> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t12 = extractelement <2 x float> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t12 = extractelement <2 x float> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t13 = extractelement <2 x double> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t14 = extractelement <2 x double> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t31 = insertelement <8 x i1> undef, i1 false, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t41 = insertelement <8 x i1> undef, i1 true, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t30 = insertelement <8 x i8> undef, i8 0, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t40 = insertelement <8 x i8> undef, i8 1, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t50 = insertelement <4 x i16> undef, i16 2, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t60 = insertelement <4 x i16> undef, i16 3, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t70 = insertelement <2 x i32> undef, i32 4, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t80 = insertelement <2 x i32> undef, i32 5, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t90 = insertelement <2 x i64> undef, i64 6, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t100 = insertelement <2 x i64> undef, i64 7, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t14 = extractelement <2 x double> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t31 = insertelement <8 x i1> undef, i1 false, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t41 = insertelement <8 x i1> undef, i1 true, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t30 = insertelement <8 x i8> undef, i8 0, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t40 = insertelement <8 x i8> undef, i8 1, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t50 = insertelement <4 x i16> undef, i16 2, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t60 = insertelement <4 x i16> undef, i16 3, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t70 = insertelement <2 x i32> undef, i32 4, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t80 = insertelement <2 x i32> undef, i32 5, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t90 = insertelement <2 x i64> undef, i64 6, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t100 = insertelement <2 x i64> undef, i64 7, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t110 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t130 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t150 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; KRYO-LABEL: 'vectorInstrCost'
@@ -122,7 +122,7 @@
 define <8 x i8> @LD1_B(<8 x i8> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_B'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2
 ;
 ; KRYO-LABEL: 'LD1_B'
@@ -139,7 +139,7 @@
 define <4 x i16> @LD1_H(<4 x i16> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_H'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2
 ;
 ; KRYO-LABEL: 'LD1_H'
@@ -156,7 +156,7 @@
 define <4 x i32> @LD1_W(<4 x i32> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_W'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2
 ;
 ; KRYO-LABEL: 'LD1_W'
@@ -173,7 +173,7 @@
 define <2 x i64> @LD1_X(<2 x i64> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_X'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2
 ;
 ; KRYO-LABEL: 'LD1_X'
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -5,24 +5,24 @@
 
 define void @fixed() {
 ; CHECK-LABEL: 'fixed'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
--- a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
@@ -190,11 +190,11 @@
 define <4 x i8> @gather_load_4xi8_constant_mask(<4 x ptr> %ptrs) {
 ; CHECK:         gather_load_4xi8_constant_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi8_constant_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_constant_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_constant_mask'
@@ -212,11 +212,11 @@
 define <4 x i8> @gather_load_4xi8_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         gather_load_4xi8_variable_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi8_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_variable_mask'
@@ -235,11 +235,11 @@
 define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x ptr> %ptrs) {
 ; CHECK:         scatter_store_4xi8_constant_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi8_constant_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_constant_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_constant_mask'
@@ -257,11 +257,11 @@
 define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         scatter_store_4xi8_variable_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi8_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_variable_mask'
@@ -280,11 +280,11 @@
 define <4 x i32> @gather_load_4xi32_constant_mask(<4 x ptr> %ptrs) {
 ; CHECK:         gather_load_4xi32_constant_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi32_constant_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_constant_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_constant_mask'
@@ -302,11 +302,11 @@
 define <4 x i32> @gather_load_4xi32_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         gather_load_4xi32_variable_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi32_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_variable_mask'
@@ -325,11 +325,11 @@
 define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x ptr> %ptrs) {
 ; CHECK:         scatter_store_4xi32_constant_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi32_constant_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_constant_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_constant_mask'
@@ -347,11 +347,11 @@
 define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         scatter_store_4xi32_variable_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi32_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_variable_mask'
@@ -370,11 +370,11 @@
 define void @sve_gather_vls(<256 x i1> %v256i1mask) {
 ; CHECK-LABEL: 'sve_scatter_vls'
 ; CHECK-NEON-LABEL: 'sve_gather_vls'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_gather_vls'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_gather_vls'
@@ -394,11 +394,11 @@
 define void @sve_gather_vls_float(<256 x i1> %v256i1mask) {
 ; CHECK-LABEL: 'sve_gather_vls_float'
 ; CHECK-NEON-LABEL: 'sve_gather_vls_float'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1664 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_gather_vls_float'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1664 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_gather_vls_float'
@@ -418,11 +418,11 @@
 define void @sve_scatter_vls(<256 x i1> %v256i1mask){
 ; CHECK-LABEL: 'sve_scatter_vls'
 ; CHECK-NEON-LABEL: 'sve_scatter_vls'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_scatter_vls'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_scatter_vls'
@@ -442,11 +442,11 @@
 define void @sve_scatter_vls_float(<512 x i1> %v512i1mask){
 ; CHECK-LABEL: 'sve_scatter_vls_float'
 ; CHECK-NEON-LABEL: 'sve_scatter_vls_float'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_scatter_vls_float'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_scatter_vls_float'
diff --git a/llvm/test/Analysis/CostModel/AArch64/min-max.ll b/llvm/test/Analysis/CostModel/AArch64/min-max.ll
--- a/llvm/test/Analysis/CostModel/AArch64/min-max.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/min-max.ll
@@ -195,10 +195,10 @@
 define void @minnum16() {
 ; CHECK-NOF16-LABEL: 'minnum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'minnum16'
@@ -220,10 +220,10 @@
 define void @maxnum16() {
 ; CHECK-NOF16-LABEL: 'maxnum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'maxnum16'
@@ -288,10 +288,10 @@
 define void @minimum16() {
 ; CHECK-NOF16-LABEL: 'minimum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'minimum16'
@@ -313,10 +313,10 @@
 define void @maximum16() {
 ; CHECK-NOF16-LABEL: 'maximum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'maximum16'
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
@@ -7,35 +7,35 @@
 
 define void @strict_fp_reductions() {
 ; CHECK-LABEL: 'strict_fp_reductions'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; FP16-LABEL: 'strict_fp_reductions'
-; FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BF16-LABEL: 'strict_fp_reductions'
-; BF16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -58,20 +58,20 @@
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -81,20 +81,20 @@
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -104,20 +104,20 @@
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
@@ -162,14 +162,14 @@
 
 define void @reduce_fmin16() {
 ; CHECK-NOF16-LABEL: 'reduce_fmin16'
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 350 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2f16m = call half @llvm.vector.reduce.fminimum.v2f16(<2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4f16m = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V8f16m = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 350 for instruction: %V16f16m = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2f16m = call half @llvm.vector.reduce.fminimum.v2f16(<2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4f16m = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V8f16m = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V16f16m = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'reduce_fmin16'
@@ -196,14 +196,14 @@
 
 define void @reduce_fmax16() {
 ; CHECK-NOF16-LABEL: 'reduce_fmax16'
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 350 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2f16m = call half @llvm.vector.reduce.fmaximum.v2f16(<2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4f16m = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V8f16m = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 350 for instruction: %V16f16m = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2f16m = call half @llvm.vector.reduce.fmaximum.v2f16(<2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4f16m = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V8f16m = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V16f16m = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'reduce_fmax16'
diff --git a/llvm/test/Analysis/CostModel/AArch64/rem.ll b/llvm/test/Analysis/CostModel/AArch64/rem.ll
--- a/llvm/test/Analysis/CostModel/AArch64/rem.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/rem.ll
@@ -6,21 +6,21 @@
 define i32 @srem() {
 ; CHECK-LABEL: 'srem'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, undef
@@ -49,21 +49,21 @@
 define i32 @urem() {
 ; CHECK-LABEL: 'urem'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, undef
@@ -92,21 +92,21 @@
 define i32 @srem_const() {
 ; CHECK-LABEL: 'srem_const'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 7
@@ -135,21 +135,21 @@
 define i32 @urem_const() {
 ; CHECK-LABEL: 'urem_const'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 7
@@ -178,21 +178,21 @@
 define i32 @srem_uniformconst() {
 ; CHECK-LABEL: 'srem_uniformconst'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = srem <2 x i64> undef, <i64 7, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = srem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = srem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, <i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 7
@@ -221,21 +221,21 @@
 define i32 @urem_uniformconst() {
 ; CHECK-LABEL: 'urem_uniformconst'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = urem <2 x i64> undef, <i64 7, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = urem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = urem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, <i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 7
@@ -264,21 +264,21 @@
 define i32 @srem_constpow2() {
 ; CHECK-LABEL: 'srem_constpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 16
@@ -307,21 +307,21 @@
 define i32 @urem_constpow2() {
 ; CHECK-LABEL: 'urem_constpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 16
@@ -350,21 +350,21 @@
 define i32 @srem_uniformconstpow2() {
 ; CHECK-LABEL: 'srem_uniformconstpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16i32 = srem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i32 = srem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8i16 = srem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = srem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 404 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 704 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 16
@@ -393,21 +393,21 @@
 define i32 @urem_uniformconstpow2() {
 ; CHECK-LABEL: 'urem_uniformconstpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 16
@@ -436,21 +436,21 @@
 define i32 @srem_constnegpow2() {
 ; CHECK-LABEL: 'srem_constnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, -16
@@ -479,21 +479,21 @@
 define i32 @urem_constnegpow2() {
 ; CHECK-LABEL: 'urem_constnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, -16
@@ -522,21 +522,21 @@
 define i32 @srem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'srem_uniformconstnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -16, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, -16
@@ -565,21 +565,21 @@
 define i32 @urem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'urem_uniformconstnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -16, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, -16
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
@@ -265,13 +265,13 @@
 define <8 x i8> @ld1r_8b_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_8b_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_8b_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %lane
 ;
@@ -285,13 +285,13 @@
 define <16 x i8> @ld1r_16b_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_16b_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_16b_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %lane
 ;
@@ -305,13 +305,13 @@
 define <4 x i16> @ld1r_4h_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_4h_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_4h_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %lane
 ;
@@ -325,13 +325,13 @@
 define <8 x i16> @ld1r_8h_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_8h_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_8h_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %lane
 ;
@@ -345,13 +345,13 @@
 define <2 x i32> @ld1r_2s_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_2s_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_2s_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %lane
 ;
@@ -365,13 +365,13 @@
 define <4 x i32> @ld1r_4s_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_4s_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_4s_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %lane
 ;
@@ -385,13 +385,13 @@
 define <2 x i64> @ld1r_2d_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_2d_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i64, ptr %x, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_2d_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i64, ptr %x, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %lane
 ;
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -87,42 +87,42 @@
 
 define void @insert_subvec() {
 ; CHECK-LABEL: 'insert_subvec'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_2_1 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v8i8_2_0 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v8i8_2_0 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2_1 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2_2 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2_3 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_2 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_1 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -169,16 +169,16 @@
 
 define void @multipart() {
 ; CHECK-LABEL: 'multipart'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v16a = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v16a = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16b = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 31, i32 30, i32 29, i32 28>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 ; COST-LABEL: sel.v8i8
-; COST:       Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CODE-LABEL: sel.v8i8
 ; CODE:       tbl v0.8b, { v0.16b }, v1.8b
 define <8 x i8> @sel.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
@@ -13,7 +13,7 @@
 }
 
 ; COST-LABEL: sel.v16i8
-; COST:       Found an estimated cost of 90 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; COST:       Found an estimated cost of 60 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
 ; CODE-LABEL: sel.v16i8
 ; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 define <16 x i8> @sel.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
@@ -32,7 +32,7 @@
 }
 
 ; COST-LABEL: sel.v8i16
-; COST:       Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CODE-LABEL: sel.v8i16
 ; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 define <8 x i16> @sel.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
--- a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
@@ -9,11 +9,11 @@
 
 define void @ins_el0() #0 {
 ; CHECK-DEFAULT-LABEL: 'ins_el0'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vi1 = insertelement <vscale x 16 x i1> zeroinitializer, i1 false, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement <vscale x 16 x i1> zeroinitializer, i1 false, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -50,13 +50,13 @@
 
 define void @ins_el1() #0 {
 ; CHECK-DEFAULT-LABEL: 'ins_el1'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vi1 = insertelement <vscale x 16 x i1> zeroinitializer, i1 false, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement <vscale x 16 x i1> zeroinitializer, i1 false, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 1
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-LOW-LABEL: 'ins_el1'
@@ -92,11 +92,11 @@
 
 define void @ext_el0() #0 {
 ; CHECK-DEFAULT-LABEL: 'ext_el0'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vi1 = extractelement <vscale x 16 x i1> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement <vscale x 16 x i1> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -133,13 +133,13 @@
 
 define void @ext_el1() #0 {
 ; CHECK-DEFAULT-LABEL: 'ext_el1'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vi1 = extractelement <vscale x 16 x i1> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement <vscale x 16 x i1> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 1
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-LOW-LABEL: 'ext_el1'
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -6,8 +6,8 @@
 
 define void @vector_insert_extract(<vscale x 4 x i32> %v0, <vscale x 16 x i32> %v1, <16 x i32> %v2) {
 ; CHECK-LABEL: 'vector_insert_extract'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -618,7 +618,7 @@
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -633,16 +633,16 @@
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
@@ -746,11 +746,11 @@
 
 define <4 x i32> @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32> %passthru) {
 ; CHECK-LABEL: 'masked_gather_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v4i32'
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
   %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
@@ -759,7 +759,7 @@
 
 define <1 x i128> @masked_gather_v1i128(<1 x ptr> %ld, <1 x i1> %masks, <1 x i128> %passthru) {
 ; CHECK-LABEL: 'masked_gather_v1i128'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <1 x i128> %res
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v1i128'
@@ -800,11 +800,11 @@
 
 define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %masks) {
 ; CHECK-LABEL: 'masked_scatter_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v4i32'
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
@@ -814,7 +814,7 @@
 
 define void @masked_scatter_v1i128(<1 x i128> %data, <1 x ptr> %ptrs, <1 x i1> %masks) {
 ; CHECK-LABEL: 'masked_scatter_v1i128'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v1i128'
diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
--- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
@@ -140,7 +140,7 @@
 }
 
 ; COST-LABEL: v2i64_select_no_cmp
-; COST-NEXT:  Cost Model: Found an estimated cost of 5 for instruction:   %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b
+; COST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction:   %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b
 
 ; CODE-LABEL: v2i64_select_no_cmp
 ; CODE:       bb.0
@@ -175,7 +175,7 @@
 
 define <8 x half> @v8f16_select_ogt(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_ogt
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp ogt <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp ogt <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp ogt <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -259,7 +259,7 @@
 
 define <8 x half> @v8f16_select_oge(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_oge
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp oge <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp oge <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp oge <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -343,7 +343,7 @@
 
 define <8 x half> @v8f16_select_olt(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_olt
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp olt <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp olt <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp olt <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -427,7 +427,7 @@
 
 define <8 x half> @v8f16_select_ole(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_ole
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp ole <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp ole <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp ole <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -511,7 +511,7 @@
 
 define <8 x half> @v8f16_select_oeq(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_oeq
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp oeq <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp oeq <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp oeq <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -597,7 +597,7 @@
 
 define <8 x half> @v8f16_select_one(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_one
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:  %cmp.1 = fcmp one <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:  %cmp.1 = fcmp one <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp one <8 x half> %a, %b
 ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -689,7 +689,7 @@
 
 define <8 x half> @v8f16_select_une(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_une
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp une <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp une <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp une <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -13,7 +13,7 @@
 ; %var4 a lower scalarization overhead.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        LV: Found an estimated cost of 4 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
+; COST:        LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
 ;
 ;
 define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -7,7 +7,7 @@
 %pair = type { i8, i8 }
 
 ; CHECK-LABEL: test
-; CHECK: Found an estimated cost of 14 for VF 2 For instruction:   {{.*}} load i8
+; CHECK: Found an estimated cost of 16 for VF 2 For instruction:   {{.*}} load i8
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   {{.*}} load i8
 ; CHECK-LABEL: entry:
 ; CHECK-LABEL: vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -168,10 +168,10 @@
 ; gaps.
 ;
 ; VF_2-LABEL: Checking a loop in 'i64_factor_8'
-; VF_2:         Found an estimated cost of 10 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_2:         Found an estimated cost of 16 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -5,8 +5,8 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
-; CHECK-COST: Found an estimated cost of 11 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 25 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Found an estimated cost of 12 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Found an estimated cost of 24 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
 ; CHECK-COST: Selecting VF: 1.
 
 ; We should decide this loop is not worth vectorising using fixed width vectors
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -16,10 +16,10 @@
 ; as:
 ;
 ; Cost of udiv:
-;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+;   (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 ;
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
 ;
 define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) {
 entry:
@@ -57,10 +57,10 @@
 ; as:
 ;
 ; Cost of store:
-;   (store(4) + extractelement(3)) / 2 = 3
+;   (store(4) + extractelement(4)) / 2 = 4
 ;
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
-; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
 ;
 define void @predicated_store(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -94,7 +94,7 @@
 ; CHECK: Found scalar instruction:   %addr.next = getelementptr inbounds i32, ptr %addr, i64 1
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %addr, align 4
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
-; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
 ;
 define void @predicated_store_phi(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -129,14 +129,14 @@
 ; compute the cost as:
 ;
 ; Cost of add:
-;   (add(2) + extractelement(3)) / 2 = 2
+;   (add(2) + extractelement(4)) / 2 = 3
 ; Cost of udiv:
-;   (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
+;   (udiv(2) + extractelement(4) + insertelement(4)) / 2 = 5
 ;
 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
-; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
 ;
 define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -174,13 +174,13 @@
 ; compute the cost as:
 ;
 ; Cost of add:
-;   (add(2) + extractelement(3)) / 2 = 2
+;   (add(2) + extractelement(4)) / 2 = 3
 ; Cost of store:
 ;   store(4) / 2 = 2
 ;
 ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
-; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
 ;
 define void @predicated_store_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
@@ -219,11 +219,11 @@
 ; Cost of add:
 ;   add(1) = 1
 ; Cost of sdiv:
-;   (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+;   (sdiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 ; Cost of udiv:
-;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+;   (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 ; Cost of sub:
-;   (sub(2) + extractelement(3)) / 2 = 2
+;   (sub(2) + extractelement(4)) / 2 = 3
 ; Cost of store:
 ;   store(4) / 2 = 2
 ;
@@ -233,9 +233,9 @@
 ; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
 ; CHECK:     Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
 ; CHECK:     Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
-; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
-; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
-; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
+; CHECK:     Found an estimated cost of 7 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK:     Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK:     Found an estimated cost of 3 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
 ; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, ptr %tmp0, align 4
 ;
 define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -6,8 +6,8 @@
 
 target triple="aarch64-unknown-linux-gnu"
 
-; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction:   %add = fadd float %0, %sum.07
 
 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) {
 entry:
@@ -28,8 +28,8 @@
 }
 
 
-; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction:   %add = fadd double %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 12 for VF 4 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 24 for VF 8 For instruction:   %add = fadd double %0, %sum.07
 
 define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) {
 entry:
@@ -49,8 +49,8 @@
   ret double %add
 }
 
-; CHECK-VF4: Found an estimated cost of 19 for VF 4 For instruction:   %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
-; CHECK-VF8: Found an estimated cost of 38 for VF 8 For instruction:   %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF4: Found an estimated cost of 16 for VF 4 For instruction:   %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF8: Found an estimated cost of 32 for VF 8 For instruction:   %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
 
 define float @fmuladd_strict32(ptr %a, ptr %b, i64 %n) {
 entry:
@@ -74,8 +74,8 @@
 
 declare float @llvm.fmuladd.f32(float, float, float)
 
-; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction:   %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
-; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction:   %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF4: Found an estimated cost of 16 for VF 4 For instruction:   %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF8: Found an estimated cost of 32 for VF 8 For instruction:   %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
 
 define double @fmuladd_strict64(ptr %a, ptr %b, i64 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
@@ -1,14 +1,13 @@
 ; REQUIRES: asserts
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
 
-; Specify a large unsafe vectorization factor of 32 that gets clamped to 16,
-; then test an even smaller VF of 2 is selected based on the cost-model.
+; Specify a large unsafe vectorization factor of 32 that gets clamped to 16.
 
 ; CHECK: LV: User VF=32 is unsafe, clamping to max safe VF=16.
 ; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 32 is unsafe, clamping to maximum safe vectorization factor 16
-; CHECK: LV: Selecting VF: 2.
+; CHECK: LV: Selecting VF: 16.
 ; CHECK-LABEL: @test
-; CHECK: <2 x i64>
+; CHECK: <16 x i64>
 define void @test(ptr nocapture %a, ptr nocapture readonly %b) {
 entry:
   br label %loop.header
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
@@ -5,46 +5,10 @@
 define <1 x float> @dotproduct_float_v6(<6 x float> %a, <6 x float> %b) {
 ; CHECK-LABEL: @dotproduct_float_v6(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <6 x float> [[A:%.*]], <6 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 3>
-; CHECK-NEXT:    [[SPLIT4:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 4>
-; CHECK-NEXT:    [[SPLIT5:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 5>
-; CHECK-NEXT:    [[SPLIT6:%.*]] = shufflevector <6 x float> [[B:%.*]], <6 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <1 x float> [[SPLIT]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP0]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <1 x float> [[SPLIT1]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> poison, float [[TMP2]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK7]], <1 x float> [[SPLAT_SPLAT9]], <1 x float> [[TMP1]])
-; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <1 x float> [[SPLIT2]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> poison, float [[TMP4]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP3]])
-; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <1 x float> [[SPLIT3]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 3
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK13]], <1 x float> [[SPLAT_SPLAT15]], <1 x float> [[TMP5]])
-; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <1 x float> [[SPLIT4]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 4
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> poison, float [[TMP8]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP7]])
-; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <1 x float> [[SPLIT5]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 5
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> poison, float [[TMP10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK19]], <1 x float> [[SPLAT_SPLAT21]], <1 x float> [[TMP9]])
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <1 x float> poison, <1 x float> [[TMP12]], <1 x i32> <i32 1>
-; CHECK-NEXT:    ret <1 x float> [[TMP13]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fmul <6 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v6f32(float 0.000000e+00, <6 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT:    ret <1 x float> [[TMP2]]
 ;
 entry:
   %c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 1, i32 6, i32 1)
@@ -175,51 +139,12 @@
 define <1 x double> @intrinsic_column_major_load_dot_product_double_v6(ptr %lhs_address, ptr %rhs_address) {
 ; CHECK-LABEL: @intrinsic_column_major_load_dot_product_double_v6(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <1 x double>, ptr [[LHS_ADDRESS:%.*]], align 4
-; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 1
-; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[VEC_GEP]], align 4
-; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 2
-; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[VEC_GEP2]], align 4
-; CHECK-NEXT:    [[VEC_GEP4:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 3
-; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <1 x double>, ptr [[VEC_GEP4]], align 4
-; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 4
-; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load <1 x double>, ptr [[VEC_GEP6]], align 4
-; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 5
-; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[VEC_GEP8]], align 4
-; CHECK-NEXT:    [[COL_LOAD10:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4
-; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <1 x double> [[COL_LOAD]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT:    [[BLOCK11:%.*]] = shufflevector <1 x double> [[COL_LOAD1]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT12]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK11]], <1 x double> [[SPLAT_SPLAT13]], <1 x double> [[TMP1]])
-; CHECK-NEXT:    [[BLOCK14:%.*]] = shufflevector <1 x double> [[COL_LOAD3]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK14]], <1 x double> [[SPLAT_SPLAT16]], <1 x double> [[TMP3]])
-; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <1 x double> [[COL_LOAD5]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 3
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> poison, double [[TMP6]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK17]], <1 x double> [[SPLAT_SPLAT19]], <1 x double> [[TMP5]])
-; CHECK-NEXT:    [[BLOCK20:%.*]] = shufflevector <1 x double> [[COL_LOAD7]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 4
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK20]], <1 x double> [[SPLAT_SPLAT22]], <1 x double> [[TMP7]])
-; CHECK-NEXT:    [[BLOCK23:%.*]] = shufflevector <1 x double> [[COL_LOAD9]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 5
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT24]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK23]], <1 x double> [[SPLAT_SPLAT25]], <1 x double> [[TMP9]])
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <1 x double> poison, <1 x double> [[TMP12]], <1 x i32> <i32 1>
-; CHECK-NEXT:    ret <1 x double> [[TMP13]]
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x double>, ptr [[LHS_ADDRESS:%.*]], align 64
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <6 x double> [[TMP0]], [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v6f64(double 0.000000e+00, <6 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    ret <1 x double> [[TMP3]]
 ;
 entry:
   %lhs = tail call fast <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4 %lhs_address, i64 1, i1 false, i32 1, i32 6)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -12,18 +12,21 @@
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
+; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
+; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S3]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void
@@ -58,23 +61,25 @@
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[C0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
+; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
+; CHECK-NEXT:    [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]]
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[A0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP7]]
+; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
+; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
+; CHECK-NEXT:    [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A1]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
+; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
+; CHECK-NEXT:    [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A2]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
+; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
+; CHECK-NEXT:    [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A3]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -10,7 +10,7 @@
 ; REMARK-LABEL: Function: gather_multiple_use
 ; REMARK:       Args:
 ; REMARK-NEXT:    - String: 'Vectorized horizontal reduction with cost '
-; REMARK-NEXT:    - Cost: '-7'
+; REMARK-NEXT:    - Cost: '-8'
 ;
 ; REMARK-NOT: Function: gather_load
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -30,7 +30,7 @@
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - Cost:            '4'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll
@@ -8,7 +8,7 @@
 ; YAML-NEXT: Function:        foo
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '3'
+; YAML-NEXT:   - Cost:            '2'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '2'
 
@@ -28,7 +28,7 @@
 ; YAML-NEXT: Function:        foo
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '1'
+; YAML-NEXT:   - Cost:            '2'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '9'
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
@@ -11,66 +11,46 @@
 define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapture readonly %B) {
 ; CHECK-LABEL: @wrap_mul4(
 ; CHECK-NEXT:    [[TEMP:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[TEMP1:%.*]] = load double, ptr [[B:%.*]], align 8
-; CHECK-NEXT:    [[MUL_I:%.*]] = fmul double [[TEMP]], [[TEMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 0, i64 1
 ; CHECK-NEXT:    [[TEMP2:%.*]] = load double, ptr [[ARRAYIDX5_I]], align 8
-; CHECK-NEXT:    [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 0
-; CHECK-NEXT:    [[TEMP3:%.*]] = load double, ptr [[ARRAYIDX7_I]], align 8
-; CHECK-NEXT:    [[MUL8_I:%.*]] = fmul double [[TEMP2]], [[TEMP3]]
-; CHECK-NEXT:    [[ADD_I:%.*]] = fadd double [[MUL_I]], [[MUL8_I]]
-; CHECK-NEXT:    [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 1
-; CHECK-NEXT:    [[TEMP4:%.*]] = load double, ptr [[ARRAYIDX13_I]], align 8
-; CHECK-NEXT:    [[MUL14_I:%.*]] = fmul double [[TEMP]], [[TEMP4]]
-; CHECK-NEXT:    [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 1
-; CHECK-NEXT:    [[TEMP5:%.*]] = load double, ptr [[ARRAYIDX18_I]], align 8
-; CHECK-NEXT:    [[MUL19_I:%.*]] = fmul double [[TEMP2]], [[TEMP5]]
-; CHECK-NEXT:    [[ADD20_I:%.*]] = fadd double [[MUL14_I]], [[MUL19_I]]
+; CHECK-NEXT:    [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B:%.*]], i64 1, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 2
-; CHECK-NEXT:    [[TEMP6:%.*]] = load double, ptr [[ARRAYIDX25_I]], align 8
-; CHECK-NEXT:    [[MUL26_I:%.*]] = fmul double [[TEMP]], [[TEMP6]]
 ; CHECK-NEXT:    [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 2
-; CHECK-NEXT:    [[TEMP7:%.*]] = load double, ptr [[ARRAYIDX30_I]], align 8
-; CHECK-NEXT:    [[MUL31_I:%.*]] = fmul double [[TEMP2]], [[TEMP7]]
-; CHECK-NEXT:    [[ADD32_I:%.*]] = fadd double [[MUL26_I]], [[MUL31_I]]
-; CHECK-NEXT:    [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 3
-; CHECK-NEXT:    [[TEMP8:%.*]] = load double, ptr [[ARRAYIDX37_I]], align 8
-; CHECK-NEXT:    [[MUL38_I:%.*]] = fmul double [[TEMP]], [[TEMP8]]
-; CHECK-NEXT:    [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 3
-; CHECK-NEXT:    [[TEMP9:%.*]] = load double, ptr [[ARRAYIDX42_I]], align 8
-; CHECK-NEXT:    [[MUL43_I:%.*]] = fmul double [[TEMP2]], [[TEMP9]]
-; CHECK-NEXT:    [[ADD44_I:%.*]] = fadd double [[MUL38_I]], [[MUL43_I]]
 ; CHECK-NEXT:    [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 0
 ; CHECK-NEXT:    [[TEMP10:%.*]] = load double, ptr [[ARRAYIDX47_I]], align 8
-; CHECK-NEXT:    [[MUL50_I:%.*]] = fmul double [[TEMP1]], [[TEMP10]]
 ; CHECK-NEXT:    [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 1
 ; CHECK-NEXT:    [[TEMP11:%.*]] = load double, ptr [[ARRAYIDX52_I]], align 8
-; CHECK-NEXT:    [[MUL55_I:%.*]] = fmul double [[TEMP3]], [[TEMP11]]
-; CHECK-NEXT:    [[ADD56_I:%.*]] = fadd double [[MUL50_I]], [[MUL55_I]]
-; CHECK-NEXT:    [[MUL62_I:%.*]] = fmul double [[TEMP4]], [[TEMP10]]
-; CHECK-NEXT:    [[MUL67_I:%.*]] = fmul double [[TEMP5]], [[TEMP11]]
-; CHECK-NEXT:    [[ADD68_I:%.*]] = fadd double [[MUL62_I]], [[MUL67_I]]
-; CHECK-NEXT:    [[MUL74_I:%.*]] = fmul double [[TEMP6]], [[TEMP10]]
-; CHECK-NEXT:    [[MUL79_I:%.*]] = fmul double [[TEMP7]], [[TEMP11]]
-; CHECK-NEXT:    [[ADD80_I:%.*]] = fadd double [[MUL74_I]], [[MUL79_I]]
-; CHECK-NEXT:    [[MUL86_I:%.*]] = fmul double [[TEMP8]], [[TEMP10]]
-; CHECK-NEXT:    [[MUL91_I:%.*]] = fmul double [[TEMP9]], [[TEMP11]]
-; CHECK-NEXT:    [[ADD92_I:%.*]] = fadd double [[MUL86_I]], [[MUL91_I]]
-; CHECK-NEXT:    store double [[ADD_I]], ptr [[OUT:%.*]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 1
-; CHECK-NEXT:    store double [[ADD20_I]], ptr [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 2
-; CHECK-NEXT:    store double [[ADD32_I]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 3
-; CHECK-NEXT:    store double [[ADD44_I]], ptr [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX7_I]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX25_I]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x double> [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x double>, ptr [[ARRAYIDX30_I]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x double> [[TMP7]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd <2 x double> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[OUT]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP14]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
 ; CHECK-NEXT:    [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 4
-; CHECK-NEXT:    store double [[ADD56_I]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 5
-; CHECK-NEXT:    store double [[ADD68_I]], ptr [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x double> [[TMP1]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> [[TMP18]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul <2 x double> [[TMP5]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fadd <2 x double> [[TMP17]], [[TMP20]]
+; CHECK-NEXT:    store <2 x double> [[TMP21]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
 ; CHECK-NEXT:    [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 6
-; CHECK-NEXT:    store double [[ADD80_I]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 7
-; CHECK-NEXT:    store double [[ADD92_I]], ptr [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14]], align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = fmul <2 x double> [[TMP10]], [[TMP16]]
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> [[TMP12]], [[TMP19]]
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <2 x double> [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    store <2 x double> [[TMP24]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %temp = load double, ptr %A, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -654,18 +654,17 @@
 define void @single_membound(ptr %arg, ptr %arg1, double %x) {
 ; CHECK-LABEL: @single_membound(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
+; CHECK-NEXT:    [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01
 ; CHECK-NEXT:    store double [[TMP]], ptr [[TMP9]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARG1:%.*]], align 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = fsub double 1.000000e+00, [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 2
 ; CHECK-NEXT:    br label [[BB15:%.*]]
 ; CHECK:       bb15:
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul double [[TMP]], 2.000000e+01
-; CHECK-NEXT:    store double [[TMP16]], ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP17:%.*]] = fmul double [[TMP13]], 3.000000e+01
-; CHECK-NEXT:    store double [[TMP17]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 2.000000e+01, double 3.000000e+01>
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[TMP9]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1232,27 +1231,25 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[T19:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
 ; CHECK-NEXT:    [[T20:%.*]] = load float, ptr [[ARG_3:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[T20]], i32 1
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB22:%.*]], label [[BB30:%.*]]
 ; CHECK:       bb22:
 ; CHECK-NEXT:    [[T23:%.*]] = fmul float [[T20]], 9.900000e+01
-; CHECK-NEXT:    [[T24:%.*]] = fmul float [[T23]], 9.900000e+01
 ; CHECK-NEXT:    [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2
-; CHECK-NEXT:    [[T26:%.*]] = fmul float [[T23]], 1.000000e+01
-; CHECK-NEXT:    store float [[T26]], ptr [[T25]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 9.900000e+01, float 1.000000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    store float [[TMP4]], ptr [[T25]], align 4
 ; CHECK-NEXT:    [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8
-; CHECK-NEXT:    [[T28:%.*]] = fadd float [[T24]], 2.000000e+01
-; CHECK-NEXT:    [[T29:%.*]] = fadd float [[T26]], 2.000000e+01
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
 ; CHECK-NEXT:    br label [[BB30]]
 ; CHECK:       bb30:
-; CHECK-NEXT:    [[T31:%.*]] = phi float [ [[T28]], [[BB22]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[T32:%.*]] = phi float [ [[T29]], [[BB22]] ], [ [[T20]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[BB36:%.*]]
 ; CHECK:       bb36:
-; CHECK-NEXT:    [[T37:%.*]] = fmul float [[T31]], 3.000000e+00
-; CHECK-NEXT:    store float [[T37]], ptr [[ARG_3]], align 4
-; CHECK-NEXT:    [[T39:%.*]] = fmul float [[T32]], 3.000000e+00
-; CHECK-NEXT:    [[T40:%.*]] = getelementptr inbounds float, ptr [[ARG_3]], i64 1
-; CHECK-NEXT:    store float [[T39]], ptr [[T40]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float 3.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[ARG_3]], align 4
 ; CHECK-NEXT:    br label [[BB41:%.*]]
 ; CHECK:       bb41:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
@@ -14,389 +14,168 @@
 ; CHECK-LABEL: @straight(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
-; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]]
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
-; CHECK-NEXT:    [[ADD11_1:%.*]] = add nuw i32 [[MUL_1]], [[MUL]]
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
-; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
-; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
-; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
-; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
-; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
-; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
-; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
-; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
-; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
-; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]]
-; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
-; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
-; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
-; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
-; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]]
-; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
-; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
-; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
-; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
-; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]]
-; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
-; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ADD_PTR]], align 2
-; CHECK-NEXT:    [[CONV_140:%.*]] = zext i16 [[TMP8]] to i32
-; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]]
-; CHECK-NEXT:    [[MUL_142:%.*]] = mul nuw nsw i32 [[CONV_140]], [[CONV_140]]
-; CHECK-NEXT:    [[ADD11_143:%.*]] = add i32 [[MUL_142]], [[ADD11_7]]
-; CHECK-NEXT:    [[ARRAYIDX_1_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_1_1]], align 2
-; CHECK-NEXT:    [[CONV_1_1:%.*]] = zext i16 [[TMP9]] to i32
-; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]]
-; CHECK-NEXT:    [[MUL_1_1:%.*]] = mul nuw nsw i32 [[CONV_1_1]], [[CONV_1_1]]
-; CHECK-NEXT:    [[ADD11_1_1:%.*]] = add i32 [[MUL_1_1]], [[ADD11_143]]
-; CHECK-NEXT:    [[ARRAYIDX_2_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 2
-; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_2_1]], align 2
-; CHECK-NEXT:    [[CONV_2_1:%.*]] = zext i16 [[TMP10]] to i32
-; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]]
-; CHECK-NEXT:    [[MUL_2_1:%.*]] = mul nuw nsw i32 [[CONV_2_1]], [[CONV_2_1]]
-; CHECK-NEXT:    [[ADD11_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD11_1_1]]
-; CHECK-NEXT:    [[ARRAYIDX_3_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 3
-; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_3_1]], align 2
-; CHECK-NEXT:    [[CONV_3_1:%.*]] = zext i16 [[TMP11]] to i32
-; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]]
-; CHECK-NEXT:    [[MUL_3_1:%.*]] = mul nuw nsw i32 [[CONV_3_1]], [[CONV_3_1]]
-; CHECK-NEXT:    [[ADD11_3_1:%.*]] = add i32 [[MUL_3_1]], [[ADD11_2_1]]
-; CHECK-NEXT:    [[ARRAYIDX_4_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 4
-; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_4_1]], align 2
-; CHECK-NEXT:    [[CONV_4_1:%.*]] = zext i16 [[TMP12]] to i32
-; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]]
-; CHECK-NEXT:    [[MUL_4_1:%.*]] = mul nuw nsw i32 [[CONV_4_1]], [[CONV_4_1]]
-; CHECK-NEXT:    [[ADD11_4_1:%.*]] = add i32 [[MUL_4_1]], [[ADD11_3_1]]
-; CHECK-NEXT:    [[ARRAYIDX_5_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 5
-; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_5_1]], align 2
-; CHECK-NEXT:    [[CONV_5_1:%.*]] = zext i16 [[TMP13]] to i32
-; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]]
-; CHECK-NEXT:    [[MUL_5_1:%.*]] = mul nuw nsw i32 [[CONV_5_1]], [[CONV_5_1]]
-; CHECK-NEXT:    [[ADD11_5_1:%.*]] = add i32 [[MUL_5_1]], [[ADD11_4_1]]
-; CHECK-NEXT:    [[ARRAYIDX_6_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 6
-; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_6_1]], align 2
-; CHECK-NEXT:    [[CONV_6_1:%.*]] = zext i16 [[TMP14]] to i32
-; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]]
-; CHECK-NEXT:    [[MUL_6_1:%.*]] = mul nuw nsw i32 [[CONV_6_1]], [[CONV_6_1]]
-; CHECK-NEXT:    [[ADD11_6_1:%.*]] = add i32 [[MUL_6_1]], [[ADD11_5_1]]
-; CHECK-NEXT:    [[ARRAYIDX_7_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 7
-; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_7_1]], align 2
-; CHECK-NEXT:    [[CONV_7_1:%.*]] = zext i16 [[TMP15]] to i32
-; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]]
-; CHECK-NEXT:    [[MUL_7_1:%.*]] = mul nuw nsw i32 [[CONV_7_1]], [[CONV_7_1]]
-; CHECK-NEXT:    [[ADD11_7_1:%.*]] = add i32 [[MUL_7_1]], [[ADD11_6_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ADD_PTR_1]], align 2
-; CHECK-NEXT:    [[CONV_244:%.*]] = zext i16 [[TMP16]] to i32
-; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]]
-; CHECK-NEXT:    [[MUL_246:%.*]] = mul nuw nsw i32 [[CONV_244]], [[CONV_244]]
-; CHECK-NEXT:    [[ADD11_247:%.*]] = add i32 [[MUL_246]], [[ADD11_7_1]]
-; CHECK-NEXT:    [[ARRAYIDX_1_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 1
-; CHECK-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX_1_2]], align 2
-; CHECK-NEXT:    [[CONV_1_2:%.*]] = zext i16 [[TMP17]] to i32
-; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]]
-; CHECK-NEXT:    [[MUL_1_2:%.*]] = mul nuw nsw i32 [[CONV_1_2]], [[CONV_1_2]]
-; CHECK-NEXT:    [[ADD11_1_2:%.*]] = add i32 [[MUL_1_2]], [[ADD11_247]]
-; CHECK-NEXT:    [[ARRAYIDX_2_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 2
-; CHECK-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_2_2]], align 2
-; CHECK-NEXT:    [[CONV_2_2:%.*]] = zext i16 [[TMP18]] to i32
-; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]]
-; CHECK-NEXT:    [[MUL_2_2:%.*]] = mul nuw nsw i32 [[CONV_2_2]], [[CONV_2_2]]
-; CHECK-NEXT:    [[ADD11_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD11_1_2]]
-; CHECK-NEXT:    [[ARRAYIDX_3_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 3
-; CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_3_2]], align 2
-; CHECK-NEXT:    [[CONV_3_2:%.*]] = zext i16 [[TMP19]] to i32
-; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]]
-; CHECK-NEXT:    [[MUL_3_2:%.*]] = mul nuw nsw i32 [[CONV_3_2]], [[CONV_3_2]]
-; CHECK-NEXT:    [[ADD11_3_2:%.*]] = add i32 [[MUL_3_2]], [[ADD11_2_2]]
-; CHECK-NEXT:    [[ARRAYIDX_4_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 4
-; CHECK-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX_4_2]], align 2
-; CHECK-NEXT:    [[CONV_4_2:%.*]] = zext i16 [[TMP20]] to i32
-; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]]
-; CHECK-NEXT:    [[MUL_4_2:%.*]] = mul nuw nsw i32 [[CONV_4_2]], [[CONV_4_2]]
-; CHECK-NEXT:    [[ADD11_4_2:%.*]] = add i32 [[MUL_4_2]], [[ADD11_3_2]]
-; CHECK-NEXT:    [[ARRAYIDX_5_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 5
-; CHECK-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_5_2]], align 2
-; CHECK-NEXT:    [[CONV_5_2:%.*]] = zext i16 [[TMP21]] to i32
-; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]]
-; CHECK-NEXT:    [[MUL_5_2:%.*]] = mul nuw nsw i32 [[CONV_5_2]], [[CONV_5_2]]
-; CHECK-NEXT:    [[ADD11_5_2:%.*]] = add i32 [[MUL_5_2]], [[ADD11_4_2]]
-; CHECK-NEXT:    [[ARRAYIDX_6_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 6
-; CHECK-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_6_2]], align 2
-; CHECK-NEXT:    [[CONV_6_2:%.*]] = zext i16 [[TMP22]] to i32
-; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]]
-; CHECK-NEXT:    [[MUL_6_2:%.*]] = mul nuw nsw i32 [[CONV_6_2]], [[CONV_6_2]]
-; CHECK-NEXT:    [[ADD11_6_2:%.*]] = add i32 [[MUL_6_2]], [[ADD11_5_2]]
-; CHECK-NEXT:    [[ARRAYIDX_7_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 7
-; CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_7_2]], align 2
-; CHECK-NEXT:    [[CONV_7_2:%.*]] = zext i16 [[TMP23]] to i32
-; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]]
-; CHECK-NEXT:    [[MUL_7_2:%.*]] = mul nuw nsw i32 [[CONV_7_2]], [[CONV_7_2]]
-; CHECK-NEXT:    [[ADD11_7_2:%.*]] = add i32 [[MUL_7_2]], [[ADD11_6_2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ADD_PTR_2]], align 2
-; CHECK-NEXT:    [[CONV_348:%.*]] = zext i16 [[TMP24]] to i32
-; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]]
-; CHECK-NEXT:    [[MUL_350:%.*]] = mul nuw nsw i32 [[CONV_348]], [[CONV_348]]
-; CHECK-NEXT:    [[ADD11_351:%.*]] = add i32 [[MUL_350]], [[ADD11_7_2]]
-; CHECK-NEXT:    [[ARRAYIDX_1_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 1
-; CHECK-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX_1_3]], align 2
-; CHECK-NEXT:    [[CONV_1_3:%.*]] = zext i16 [[TMP25]] to i32
-; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]]
-; CHECK-NEXT:    [[MUL_1_3:%.*]] = mul nuw nsw i32 [[CONV_1_3]], [[CONV_1_3]]
-; CHECK-NEXT:    [[ADD11_1_3:%.*]] = add i32 [[MUL_1_3]], [[ADD11_351]]
-; CHECK-NEXT:    [[ARRAYIDX_2_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 2
-; CHECK-NEXT:    [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX_2_3]], align 2
-; CHECK-NEXT:    [[CONV_2_3:%.*]] = zext i16 [[TMP26]] to i32
-; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]]
-; CHECK-NEXT:    [[MUL_2_3:%.*]] = mul nuw nsw i32 [[CONV_2_3]], [[CONV_2_3]]
-; CHECK-NEXT:    [[ADD11_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD11_1_3]]
-; CHECK-NEXT:    [[ARRAYIDX_3_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 3
-; CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX_3_3]], align 2
-; CHECK-NEXT:    [[CONV_3_3:%.*]] = zext i16 [[TMP27]] to i32
-; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]]
-; CHECK-NEXT:    [[MUL_3_3:%.*]] = mul nuw nsw i32 [[CONV_3_3]], [[CONV_3_3]]
-; CHECK-NEXT:    [[ADD11_3_3:%.*]] = add i32 [[MUL_3_3]], [[ADD11_2_3]]
-; CHECK-NEXT:    [[ARRAYIDX_4_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 4
-; CHECK-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX_4_3]], align 2
-; CHECK-NEXT:    [[CONV_4_3:%.*]] = zext i16 [[TMP28]] to i32
-; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]]
-; CHECK-NEXT:    [[MUL_4_3:%.*]] = mul nuw nsw i32 [[CONV_4_3]], [[CONV_4_3]]
-; CHECK-NEXT:    [[ADD11_4_3:%.*]] = add i32 [[MUL_4_3]], [[ADD11_3_3]]
-; CHECK-NEXT:    [[ARRAYIDX_5_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 5
-; CHECK-NEXT:    [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_5_3]], align 2
-; CHECK-NEXT:    [[CONV_5_3:%.*]] = zext i16 [[TMP29]] to i32
-; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]]
-; CHECK-NEXT:    [[MUL_5_3:%.*]] = mul nuw nsw i32 [[CONV_5_3]], [[CONV_5_3]]
-; CHECK-NEXT:    [[ADD11_5_3:%.*]] = add i32 [[MUL_5_3]], [[ADD11_4_3]]
-; CHECK-NEXT:    [[ARRAYIDX_6_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 6
-; CHECK-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_6_3]], align 2
-; CHECK-NEXT:    [[CONV_6_3:%.*]] = zext i16 [[TMP30]] to i32
-; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]]
-; CHECK-NEXT:    [[MUL_6_3:%.*]] = mul nuw nsw i32 [[CONV_6_3]], [[CONV_6_3]]
-; CHECK-NEXT:    [[ADD11_6_3:%.*]] = add i32 [[MUL_6_3]], [[ADD11_5_3]]
-; CHECK-NEXT:    [[ARRAYIDX_7_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 7
-; CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX_7_3]], align 2
-; CHECK-NEXT:    [[CONV_7_3:%.*]] = zext i16 [[TMP31]] to i32
-; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]]
-; CHECK-NEXT:    [[MUL_7_3:%.*]] = mul nuw nsw i32 [[CONV_7_3]], [[CONV_7_3]]
-; CHECK-NEXT:    [[ADD11_7_3:%.*]] = add i32 [[MUL_7_3]], [[ADD11_6_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i16, ptr [[ADD_PTR_3]], align 2
-; CHECK-NEXT:    [[CONV_452:%.*]] = zext i16 [[TMP32]] to i32
-; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]]
-; CHECK-NEXT:    [[MUL_454:%.*]] = mul nuw nsw i32 [[CONV_452]], [[CONV_452]]
-; CHECK-NEXT:    [[ADD11_455:%.*]] = add i32 [[MUL_454]], [[ADD11_7_3]]
-; CHECK-NEXT:    [[ARRAYIDX_1_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX_1_4]], align 2
-; CHECK-NEXT:    [[CONV_1_4:%.*]] = zext i16 [[TMP33]] to i32
-; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]]
-; CHECK-NEXT:    [[MUL_1_4:%.*]] = mul nuw nsw i32 [[CONV_1_4]], [[CONV_1_4]]
-; CHECK-NEXT:    [[ADD11_1_4:%.*]] = add i32 [[MUL_1_4]], [[ADD11_455]]
-; CHECK-NEXT:    [[ARRAYIDX_2_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 2
-; CHECK-NEXT:    [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX_2_4]], align 2
-; CHECK-NEXT:    [[CONV_2_4:%.*]] = zext i16 [[TMP34]] to i32
-; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]]
-; CHECK-NEXT:    [[MUL_2_4:%.*]] = mul nuw nsw i32 [[CONV_2_4]], [[CONV_2_4]]
-; CHECK-NEXT:    [[ADD11_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD11_1_4]]
-; CHECK-NEXT:    [[ARRAYIDX_3_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 3
-; CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX_3_4]], align 2
-; CHECK-NEXT:    [[CONV_3_4:%.*]] = zext i16 [[TMP35]] to i32
-; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]]
-; CHECK-NEXT:    [[MUL_3_4:%.*]] = mul nuw nsw i32 [[CONV_3_4]], [[CONV_3_4]]
-; CHECK-NEXT:    [[ADD11_3_4:%.*]] = add i32 [[MUL_3_4]], [[ADD11_2_4]]
-; CHECK-NEXT:    [[ARRAYIDX_4_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 4
-; CHECK-NEXT:    [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_4_4]], align 2
-; CHECK-NEXT:    [[CONV_4_4:%.*]] = zext i16 [[TMP36]] to i32
-; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]]
-; CHECK-NEXT:    [[MUL_4_4:%.*]] = mul nuw nsw i32 [[CONV_4_4]], [[CONV_4_4]]
-; CHECK-NEXT:    [[ADD11_4_4:%.*]] = add i32 [[MUL_4_4]], [[ADD11_3_4]]
-; CHECK-NEXT:    [[ARRAYIDX_5_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 5
-; CHECK-NEXT:    [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX_5_4]], align 2
-; CHECK-NEXT:    [[CONV_5_4:%.*]] = zext i16 [[TMP37]] to i32
-; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]]
-; CHECK-NEXT:    [[MUL_5_4:%.*]] = mul nuw nsw i32 [[CONV_5_4]], [[CONV_5_4]]
-; CHECK-NEXT:    [[ADD11_5_4:%.*]] = add i32 [[MUL_5_4]], [[ADD11_4_4]]
-; CHECK-NEXT:    [[ARRAYIDX_6_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 6
-; CHECK-NEXT:    [[TMP38:%.*]] = load i16, ptr [[ARRAYIDX_6_4]], align 2
-; CHECK-NEXT:    [[CONV_6_4:%.*]] = zext i16 [[TMP38]] to i32
-; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]]
-; CHECK-NEXT:    [[MUL_6_4:%.*]] = mul nuw nsw i32 [[CONV_6_4]], [[CONV_6_4]]
-; CHECK-NEXT:    [[ADD11_6_4:%.*]] = add i32 [[MUL_6_4]], [[ADD11_5_4]]
-; CHECK-NEXT:    [[ARRAYIDX_7_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 7
-; CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[ARRAYIDX_7_4]], align 2
-; CHECK-NEXT:    [[CONV_7_4:%.*]] = zext i16 [[TMP39]] to i32
-; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]]
-; CHECK-NEXT:    [[MUL_7_4:%.*]] = mul nuw nsw i32 [[CONV_7_4]], [[CONV_7_4]]
-; CHECK-NEXT:    [[ADD11_7_4:%.*]] = add i32 [[MUL_7_4]], [[ADD11_6_4]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP40:%.*]] = load i16, ptr [[ADD_PTR_4]], align 2
-; CHECK-NEXT:    [[CONV_556:%.*]] = zext i16 [[TMP40]] to i32
-; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]]
-; CHECK-NEXT:    [[MUL_558:%.*]] = mul nuw nsw i32 [[CONV_556]], [[CONV_556]]
-; CHECK-NEXT:    [[ADD11_559:%.*]] = add i32 [[MUL_558]], [[ADD11_7_4]]
-; CHECK-NEXT:    [[ARRAYIDX_1_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX_1_5]], align 2
-; CHECK-NEXT:    [[CONV_1_5:%.*]] = zext i16 [[TMP41]] to i32
-; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]]
-; CHECK-NEXT:    [[MUL_1_5:%.*]] = mul nuw nsw i32 [[CONV_1_5]], [[CONV_1_5]]
-; CHECK-NEXT:    [[ADD11_1_5:%.*]] = add i32 [[MUL_1_5]], [[ADD11_559]]
-; CHECK-NEXT:    [[ARRAYIDX_2_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 2
-; CHECK-NEXT:    [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX_2_5]], align 2
-; CHECK-NEXT:    [[CONV_2_5:%.*]] = zext i16 [[TMP42]] to i32
-; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]]
-; CHECK-NEXT:    [[MUL_2_5:%.*]] = mul nuw nsw i32 [[CONV_2_5]], [[CONV_2_5]]
-; CHECK-NEXT:    [[ADD11_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD11_1_5]]
-; CHECK-NEXT:    [[ARRAYIDX_3_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 3
-; CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX_3_5]], align 2
-; CHECK-NEXT:    [[CONV_3_5:%.*]] = zext i16 [[TMP43]] to i32
-; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]]
-; CHECK-NEXT:    [[MUL_3_5:%.*]] = mul nuw nsw i32 [[CONV_3_5]], [[CONV_3_5]]
-; CHECK-NEXT:    [[ADD11_3_5:%.*]] = add i32 [[MUL_3_5]], [[ADD11_2_5]]
-; CHECK-NEXT:    [[ARRAYIDX_4_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 4
-; CHECK-NEXT:    [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_4_5]], align 2
-; CHECK-NEXT:    [[CONV_4_5:%.*]] = zext i16 [[TMP44]] to i32
-; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]]
-; CHECK-NEXT:    [[MUL_4_5:%.*]] = mul nuw nsw i32 [[CONV_4_5]], [[CONV_4_5]]
-; CHECK-NEXT:    [[ADD11_4_5:%.*]] = add i32 [[MUL_4_5]], [[ADD11_3_5]]
-; CHECK-NEXT:    [[ARRAYIDX_5_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 5
-; CHECK-NEXT:    [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX_5_5]], align 2
-; CHECK-NEXT:    [[CONV_5_5:%.*]] = zext i16 [[TMP45]] to i32
-; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]]
-; CHECK-NEXT:    [[MUL_5_5:%.*]] = mul nuw nsw i32 [[CONV_5_5]], [[CONV_5_5]]
-; CHECK-NEXT:    [[ADD11_5_5:%.*]] = add i32 [[MUL_5_5]], [[ADD11_4_5]]
-; CHECK-NEXT:    [[ARRAYIDX_6_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 6
-; CHECK-NEXT:    [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX_6_5]], align 2
-; CHECK-NEXT:    [[CONV_6_5:%.*]] = zext i16 [[TMP46]] to i32
-; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]]
-; CHECK-NEXT:    [[MUL_6_5:%.*]] = mul nuw nsw i32 [[CONV_6_5]], [[CONV_6_5]]
-; CHECK-NEXT:    [[ADD11_6_5:%.*]] = add i32 [[MUL_6_5]], [[ADD11_5_5]]
-; CHECK-NEXT:    [[ARRAYIDX_7_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 7
-; CHECK-NEXT:    [[TMP47:%.*]] = load i16, ptr [[ARRAYIDX_7_5]], align 2
-; CHECK-NEXT:    [[CONV_7_5:%.*]] = zext i16 [[TMP47]] to i32
-; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]]
-; CHECK-NEXT:    [[MUL_7_5:%.*]] = mul nuw nsw i32 [[CONV_7_5]], [[CONV_7_5]]
-; CHECK-NEXT:    [[ADD11_7_5:%.*]] = add i32 [[MUL_7_5]], [[ADD11_6_5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP48:%.*]] = load i16, ptr [[ADD_PTR_5]], align 2
-; CHECK-NEXT:    [[CONV_660:%.*]] = zext i16 [[TMP48]] to i32
-; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]]
-; CHECK-NEXT:    [[MUL_662:%.*]] = mul nuw nsw i32 [[CONV_660]], [[CONV_660]]
-; CHECK-NEXT:    [[ADD11_663:%.*]] = add i32 [[MUL_662]], [[ADD11_7_5]]
-; CHECK-NEXT:    [[ARRAYIDX_1_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX_1_6]], align 2
-; CHECK-NEXT:    [[CONV_1_6:%.*]] = zext i16 [[TMP49]] to i32
-; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]]
-; CHECK-NEXT:    [[MUL_1_6:%.*]] = mul nuw nsw i32 [[CONV_1_6]], [[CONV_1_6]]
-; CHECK-NEXT:    [[ADD11_1_6:%.*]] = add i32 [[MUL_1_6]], [[ADD11_663]]
-; CHECK-NEXT:    [[ARRAYIDX_2_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 2
-; CHECK-NEXT:    [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX_2_6]], align 2
-; CHECK-NEXT:    [[CONV_2_6:%.*]] = zext i16 [[TMP50]] to i32
-; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]]
-; CHECK-NEXT:    [[MUL_2_6:%.*]] = mul nuw nsw i32 [[CONV_2_6]], [[CONV_2_6]]
-; CHECK-NEXT:    [[ADD11_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD11_1_6]]
-; CHECK-NEXT:    [[ARRAYIDX_3_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 3
-; CHECK-NEXT:    [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX_3_6]], align 2
-; CHECK-NEXT:    [[CONV_3_6:%.*]] = zext i16 [[TMP51]] to i32
-; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]]
-; CHECK-NEXT:    [[MUL_3_6:%.*]] = mul nuw nsw i32 [[CONV_3_6]], [[CONV_3_6]]
-; CHECK-NEXT:    [[ADD11_3_6:%.*]] = add i32 [[MUL_3_6]], [[ADD11_2_6]]
-; CHECK-NEXT:    [[ARRAYIDX_4_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 4
-; CHECK-NEXT:    [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_4_6]], align 2
-; CHECK-NEXT:    [[CONV_4_6:%.*]] = zext i16 [[TMP52]] to i32
-; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]]
-; CHECK-NEXT:    [[MUL_4_6:%.*]] = mul nuw nsw i32 [[CONV_4_6]], [[CONV_4_6]]
-; CHECK-NEXT:    [[ADD11_4_6:%.*]] = add i32 [[MUL_4_6]], [[ADD11_3_6]]
-; CHECK-NEXT:    [[ARRAYIDX_5_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 5
-; CHECK-NEXT:    [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX_5_6]], align 2
-; CHECK-NEXT:    [[CONV_5_6:%.*]] = zext i16 [[TMP53]] to i32
-; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]]
-; CHECK-NEXT:    [[MUL_5_6:%.*]] = mul nuw nsw i32 [[CONV_5_6]], [[CONV_5_6]]
-; CHECK-NEXT:    [[ADD11_5_6:%.*]] = add i32 [[MUL_5_6]], [[ADD11_4_6]]
-; CHECK-NEXT:    [[ARRAYIDX_6_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 6
-; CHECK-NEXT:    [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX_6_6]], align 2
-; CHECK-NEXT:    [[CONV_6_6:%.*]] = zext i16 [[TMP54]] to i32
-; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]]
-; CHECK-NEXT:    [[MUL_6_6:%.*]] = mul nuw nsw i32 [[CONV_6_6]], [[CONV_6_6]]
-; CHECK-NEXT:    [[ADD11_6_6:%.*]] = add i32 [[MUL_6_6]], [[ADD11_5_6]]
-; CHECK-NEXT:    [[ARRAYIDX_7_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 7
-; CHECK-NEXT:    [[TMP55:%.*]] = load i16, ptr [[ARRAYIDX_7_6]], align 2
-; CHECK-NEXT:    [[CONV_7_6:%.*]] = zext i16 [[TMP55]] to i32
-; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]]
-; CHECK-NEXT:    [[MUL_7_6:%.*]] = mul nuw nsw i32 [[CONV_7_6]], [[CONV_7_6]]
-; CHECK-NEXT:    [[ADD11_7_6:%.*]] = add i32 [[MUL_7_6]], [[ADD11_6_6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP56:%.*]] = load i16, ptr [[ADD_PTR_6]], align 2
-; CHECK-NEXT:    [[CONV_764:%.*]] = zext i16 [[TMP56]] to i32
-; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]]
-; CHECK-NEXT:    [[MUL_766:%.*]] = mul nuw nsw i32 [[CONV_764]], [[CONV_764]]
-; CHECK-NEXT:    [[ADD11_767:%.*]] = add i32 [[MUL_766]], [[ADD11_7_6]]
-; CHECK-NEXT:    [[ARRAYIDX_1_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX_1_7]], align 2
-; CHECK-NEXT:    [[CONV_1_7:%.*]] = zext i16 [[TMP57]] to i32
-; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]]
-; CHECK-NEXT:    [[MUL_1_7:%.*]] = mul nuw nsw i32 [[CONV_1_7]], [[CONV_1_7]]
-; CHECK-NEXT:    [[ADD11_1_7:%.*]] = add i32 [[MUL_1_7]], [[ADD11_767]]
-; CHECK-NEXT:    [[ARRAYIDX_2_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 2
-; CHECK-NEXT:    [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX_2_7]], align 2
-; CHECK-NEXT:    [[CONV_2_7:%.*]] = zext i16 [[TMP58]] to i32
-; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]]
-; CHECK-NEXT:    [[MUL_2_7:%.*]] = mul nuw nsw i32 [[CONV_2_7]], [[CONV_2_7]]
-; CHECK-NEXT:    [[ADD11_2_7:%.*]] = add i32 [[MUL_2_7]], [[ADD11_1_7]]
-; CHECK-NEXT:    [[ARRAYIDX_3_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 3
-; CHECK-NEXT:    [[TMP59:%.*]] = load i16, ptr [[ARRAYIDX_3_7]], align 2
-; CHECK-NEXT:    [[CONV_3_7:%.*]] = zext i16 [[TMP59]] to i32
-; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]]
-; CHECK-NEXT:    [[MUL_3_7:%.*]] = mul nuw nsw i32 [[CONV_3_7]], [[CONV_3_7]]
-; CHECK-NEXT:    [[ADD11_3_7:%.*]] = add i32 [[MUL_3_7]], [[ADD11_2_7]]
-; CHECK-NEXT:    [[ARRAYIDX_4_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 4
-; CHECK-NEXT:    [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_4_7]], align 2
-; CHECK-NEXT:    [[CONV_4_7:%.*]] = zext i16 [[TMP60]] to i32
-; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]]
-; CHECK-NEXT:    [[MUL_4_7:%.*]] = mul nuw nsw i32 [[CONV_4_7]], [[CONV_4_7]]
-; CHECK-NEXT:    [[ADD11_4_7:%.*]] = add i32 [[MUL_4_7]], [[ADD11_3_7]]
-; CHECK-NEXT:    [[ARRAYIDX_5_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 5
-; CHECK-NEXT:    [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX_5_7]], align 2
-; CHECK-NEXT:    [[CONV_5_7:%.*]] = zext i16 [[TMP61]] to i32
-; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]]
-; CHECK-NEXT:    [[MUL_5_7:%.*]] = mul nuw nsw i32 [[CONV_5_7]], [[CONV_5_7]]
-; CHECK-NEXT:    [[ADD11_5_7:%.*]] = add i32 [[MUL_5_7]], [[ADD11_4_7]]
-; CHECK-NEXT:    [[ARRAYIDX_6_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 6
-; CHECK-NEXT:    [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX_6_7]], align 2
-; CHECK-NEXT:    [[CONV_6_7:%.*]] = zext i16 [[TMP62]] to i32
-; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]]
-; CHECK-NEXT:    [[MUL_6_7:%.*]] = mul nuw nsw i32 [[CONV_6_7]], [[CONV_6_7]]
-; CHECK-NEXT:    [[ADD11_6_7:%.*]] = add i32 [[MUL_6_7]], [[ADD11_5_7]]
-; CHECK-NEXT:    [[ARRAYIDX_7_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 7
-; CHECK-NEXT:    [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX_7_7]], align 2
-; CHECK-NEXT:    [[CONV_7_7:%.*]] = zext i16 [[TMP63]] to i32
-; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]]
-; CHECK-NEXT:    [[MUL_7_7:%.*]] = mul nuw nsw i32 [[CONV_7_7]], [[CONV_7_7]]
-; CHECK-NEXT:    [[ADD11_7_7:%.*]] = add i32 [[MUL_7_7]], [[ADD11_6_7]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <64 x i16> [[TMP8]], <64 x i16> [[TMP9]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <64 x i16> [[TMP16]], <64 x i16> [[TMP17]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <64 x i16> [[TMP18]], <64 x i16> [[TMP19]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <64 x i16> [[TMP20]], <64 x i16> [[TMP21]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71>
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <64 x i16> [[TMP22]] to <64 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <64 x i32> [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <64 x i32> [[TMP23]], i32 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP26:%.*]] = mul nuw nsw <64 x i32> [[TMP23]], [[TMP23]]
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <64 x i32> [[TMP23]], i32 2
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP27]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <64 x i32> [[TMP23]], i32 3
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP28]]
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <64 x i32> [[TMP23]], i32 4
+; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP29]]
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <64 x i32> [[TMP23]], i32 5
+; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP30]]
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <64 x i32> [[TMP23]], i32 6
+; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP31]]
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <64 x i32> [[TMP23]], i32 7
+; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP32]]
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <64 x i32> [[TMP23]], i32 8
+; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP33]]
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <64 x i32> [[TMP23]], i32 9
+; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP34]]
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <64 x i32> [[TMP23]], i32 10
+; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP35]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <64 x i32> [[TMP23]], i32 11
+; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP36]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <64 x i32> [[TMP23]], i32 12
+; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP37]]
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <64 x i32> [[TMP23]], i32 13
+; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP38]]
+; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <64 x i32> [[TMP23]], i32 14
+; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP39]]
+; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <64 x i32> [[TMP23]], i32 15
+; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP40]]
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <64 x i32> [[TMP23]], i32 16
+; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP41]]
+; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <64 x i32> [[TMP23]], i32 17
+; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP42]]
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <64 x i32> [[TMP23]], i32 18
+; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP43]]
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <64 x i32> [[TMP23]], i32 19
+; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP44]]
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <64 x i32> [[TMP23]], i32 20
+; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP45]]
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <64 x i32> [[TMP23]], i32 21
+; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP46]]
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <64 x i32> [[TMP23]], i32 22
+; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP47]]
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <64 x i32> [[TMP23]], i32 23
+; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP48]]
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <64 x i32> [[TMP23]], i32 24
+; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP49]]
+; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <64 x i32> [[TMP23]], i32 25
+; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP50]]
+; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <64 x i32> [[TMP23]], i32 26
+; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP51]]
+; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <64 x i32> [[TMP23]], i32 27
+; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP52]]
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <64 x i32> [[TMP23]], i32 28
+; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP53]]
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <64 x i32> [[TMP23]], i32 29
+; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP54]]
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <64 x i32> [[TMP23]], i32 30
+; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP55]]
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <64 x i32> [[TMP23]], i32 31
+; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP56]]
+; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <64 x i32> [[TMP23]], i32 32
+; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP57]]
+; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <64 x i32> [[TMP23]], i32 33
+; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP58]]
+; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <64 x i32> [[TMP23]], i32 34
+; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <64 x i32> [[TMP23]], i32 35
+; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP60]]
+; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <64 x i32> [[TMP23]], i32 36
+; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP61]]
+; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <64 x i32> [[TMP23]], i32 37
+; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP62]]
+; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <64 x i32> [[TMP23]], i32 38
+; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP63]]
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <64 x i32> [[TMP23]], i32 39
+; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP64]]
+; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <64 x i32> [[TMP23]], i32 40
+; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP65]]
+; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <64 x i32> [[TMP23]], i32 41
+; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP66]]
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <64 x i32> [[TMP23]], i32 42
+; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP67]]
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <64 x i32> [[TMP23]], i32 43
+; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP68]]
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <64 x i32> [[TMP23]], i32 44
+; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP69]]
+; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <64 x i32> [[TMP23]], i32 45
+; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP70]]
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <64 x i32> [[TMP23]], i32 46
+; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP71]]
+; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <64 x i32> [[TMP23]], i32 47
+; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP72]]
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <64 x i32> [[TMP23]], i32 48
+; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP73]]
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <64 x i32> [[TMP23]], i32 49
+; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP74]]
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <64 x i32> [[TMP23]], i32 50
+; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP75]]
+; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <64 x i32> [[TMP23]], i32 51
+; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP76]]
+; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <64 x i32> [[TMP23]], i32 52
+; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP77]]
+; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <64 x i32> [[TMP23]], i32 53
+; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP78]]
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <64 x i32> [[TMP23]], i32 54
+; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP79]]
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <64 x i32> [[TMP23]], i32 55
+; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP80]]
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <64 x i32> [[TMP23]], i32 56
+; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP81]]
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <64 x i32> [[TMP23]], i32 57
+; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP82]]
+; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <64 x i32> [[TMP23]], i32 58
+; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP83]]
+; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <64 x i32> [[TMP23]], i32 59
+; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP84]]
+; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <64 x i32> [[TMP23]], i32 60
+; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP85]]
+; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <64 x i32> [[TMP23]], i32 61
+; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP86]]
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <64 x i32> [[TMP23]], i32 62
+; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP87]]
+; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <64 x i32> [[TMP23]], i32 63
+; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP88]]
+; CHECK-NEXT:    [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP26]])
 ; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64
-; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[ADD11_7_7]] to i64
+; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[TMP89]] to i64
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32
 ; CHECK-NEXT:    [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]]
 ; CHECK-NEXT:    ret i64 [[ADD17]]
@@ -798,111 +577,23 @@
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
 ; CHECK:       for.cond1.preheader:
 ; CHECK-NEXT:    [[Y_038:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC13:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD11_15:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_15:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX1:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[P_ADDR_035:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P_ADDR_035]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SM_036]], [[CONV]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
-; CHECK-NEXT:    [[ADD11:%.*]] = add i32 [[MUL]], [[SQ_037]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
-; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD]], [[CONV_1]]
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
-; CHECK-NEXT:    [[ADD11_1:%.*]] = add i32 [[MUL_1]], [[ADD11]]
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
-; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[CONV_2]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
-; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
-; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
-; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[CONV_3]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
-; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
-; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
-; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[CONV_4]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
-; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
-; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
-; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
-; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 [[ADD_4]], [[CONV_5]]
-; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
-; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
-; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
-; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
-; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 [[ADD_5]], [[CONV_6]]
-; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
-; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
-; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
-; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
-; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 [[ADD_6]], [[CONV_7]]
-; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
-; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
-; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 8
-; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2
-; CHECK-NEXT:    [[CONV_8:%.*]] = zext i16 [[TMP8]] to i32
-; CHECK-NEXT:    [[ADD_8:%.*]] = add i32 [[ADD_7]], [[CONV_8]]
-; CHECK-NEXT:    [[MUL_8:%.*]] = mul nuw nsw i32 [[CONV_8]], [[CONV_8]]
-; CHECK-NEXT:    [[ADD11_8:%.*]] = add i32 [[MUL_8]], [[ADD11_7]]
-; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 9
-; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2
-; CHECK-NEXT:    [[CONV_9:%.*]] = zext i16 [[TMP9]] to i32
-; CHECK-NEXT:    [[ADD_9:%.*]] = add i32 [[ADD_8]], [[CONV_9]]
-; CHECK-NEXT:    [[MUL_9:%.*]] = mul nuw nsw i32 [[CONV_9]], [[CONV_9]]
-; CHECK-NEXT:    [[ADD11_9:%.*]] = add i32 [[MUL_9]], [[ADD11_8]]
-; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 10
-; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2
-; CHECK-NEXT:    [[CONV_10:%.*]] = zext i16 [[TMP10]] to i32
-; CHECK-NEXT:    [[ADD_10:%.*]] = add i32 [[ADD_9]], [[CONV_10]]
-; CHECK-NEXT:    [[MUL_10:%.*]] = mul nuw nsw i32 [[CONV_10]], [[CONV_10]]
-; CHECK-NEXT:    [[ADD11_10:%.*]] = add i32 [[MUL_10]], [[ADD11_9]]
-; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 11
-; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2
-; CHECK-NEXT:    [[CONV_11:%.*]] = zext i16 [[TMP11]] to i32
-; CHECK-NEXT:    [[ADD_11:%.*]] = add i32 [[ADD_10]], [[CONV_11]]
-; CHECK-NEXT:    [[MUL_11:%.*]] = mul nuw nsw i32 [[CONV_11]], [[CONV_11]]
-; CHECK-NEXT:    [[ADD11_11:%.*]] = add i32 [[MUL_11]], [[ADD11_10]]
-; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 12
-; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2
-; CHECK-NEXT:    [[CONV_12:%.*]] = zext i16 [[TMP12]] to i32
-; CHECK-NEXT:    [[ADD_12:%.*]] = add i32 [[ADD_11]], [[CONV_12]]
-; CHECK-NEXT:    [[MUL_12:%.*]] = mul nuw nsw i32 [[CONV_12]], [[CONV_12]]
-; CHECK-NEXT:    [[ADD11_12:%.*]] = add i32 [[MUL_12]], [[ADD11_11]]
-; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 13
-; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2
-; CHECK-NEXT:    [[CONV_13:%.*]] = zext i16 [[TMP13]] to i32
-; CHECK-NEXT:    [[ADD_13:%.*]] = add i32 [[ADD_12]], [[CONV_13]]
-; CHECK-NEXT:    [[MUL_13:%.*]] = mul nuw nsw i32 [[CONV_13]], [[CONV_13]]
-; CHECK-NEXT:    [[ADD11_13:%.*]] = add i32 [[MUL_13]], [[ADD11_12]]
-; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 14
-; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2
-; CHECK-NEXT:    [[CONV_14:%.*]] = zext i16 [[TMP14]] to i32
-; CHECK-NEXT:    [[ADD_14:%.*]] = add i32 [[ADD_13]], [[CONV_14]]
-; CHECK-NEXT:    [[MUL_14:%.*]] = mul nuw nsw i32 [[CONV_14]], [[CONV_14]]
-; CHECK-NEXT:    [[ADD11_14:%.*]] = add i32 [[MUL_14]], [[ADD11_13]]
-; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 15
-; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2
-; CHECK-NEXT:    [[CONV_15:%.*]] = zext i16 [[TMP15]] to i32
-; CHECK-NEXT:    [[ADD_15]] = add i32 [[ADD_14]], [[CONV_15]]
-; CHECK-NEXT:    [[MUL_15:%.*]] = mul nuw nsw i32 [[CONV_15]], [[CONV_15]]
-; CHECK-NEXT:    [[ADD11_15]] = add i32 [[MUL_15]], [[ADD11_14]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i16>, ptr [[P_ADDR_035]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <16 x i16> [[TMP0]] to <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX1]] = add i32 [[TMP3]], [[SM_036]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP4]], [[SQ_037]]
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[INC13]] = add nuw nsw i32 [[Y_038]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 16
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[ADD_15]] to i64
-; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[ADD11_15]] to i64
+; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[OP_RDX1]] to i64
+; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[OP_RDX]] to i64
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32
 ; CHECK-NEXT:    [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]]
 ; CHECK-NEXT:    ret i64 [[ADD17]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -206,22 +206,22 @@
 define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-LABEL: @slp_not_profitable_in_loop(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT:    [[L_0:%.*]] = load float, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2
+; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
 ; CHECK-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[L_2:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
 ; CHECK-NEXT:    [[L_3:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[MUL11:%.*]] = fmul fast float 3.000000e+00, [[L_0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
 ; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
-; CHECK-NEXT:    [[MUL14:%.*]] = fmul fast float [[X:%.*]], [[L_2]]
 ; CHECK-NEXT:    [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]]
-; CHECK-NEXT:    [[ADD13:%.*]] = fadd fast float [[ADD]], [[MUL14]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
 ; CHECK-NEXT:    [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 10
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
@@ -505,9 +505,9 @@
 define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(
 ; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]
-; CHECK-NEXT:    [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i32 0
+; CHECK-NEXT:    [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i32 1
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <4 x i32>, ptr %x
@@ -521,10 +521,9 @@
 ; because the vector large vector requires 2 vector registers.
 define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i32>, ptr [[X:%.*]], i32 0, i32 0
-; CHECK-NEXT:    [[E_0:%.*]] = load i32, ptr [[TMP1]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[X]], i32 0, i32 6
-; CHECK-NEXT:    [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16
+; CHECK-NEXT:    [[E_0:%.*]] = extractelement <8 x i32> [[LV]], i32 0
+; CHECK-NEXT:    [[E_1:%.*]] = extractelement <8 x i32> [[LV]], i32 6
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;